diff options
Diffstat (limited to 'wiki2text.py')
-rw-r--r-- | wiki2text.py | 177 |
1 files changed, 116 insertions, 61 deletions
diff --git a/wiki2text.py b/wiki2text.py index f28c343..c41c4e0 100644 --- a/wiki2text.py +++ b/wiki2text.py @@ -23,13 +23,13 @@ import urllib class TextWikiMarkup (WikiMarkup): """ A (general-purpose Wiki->Text translator class. """ # Output width - width = 80 + width = 78 # Do not show references. references = False # Provide a minimum markup markup = True # Number of current element in the environment @@ -54,28 +54,32 @@ class TextWikiMarkup (WikiMarkup): if str in wiki_ns[self.lang]: return wiki_ns[self.lang][str] elif str in wiki_ns_re[self.lang]: for elt in wiki_ns_re[self.lang][str]: if str.beginswith(elt[0]) and str.endswith(elt[1]): return elt[2] - return None + return None + def mktgt(self, tgt, lang = None): if not lang: lang = self.lang return self.html_base % { 'lang' : lang } + urllib.quote(tgt) - def link(self, tok, env, istmpl): - arg = self.fmtok(tok[1], env) - text = self.fmtok(tok[2], env) + def fmtlink(self, elt, istmpl): + arg = self.format(elt[1][0]) + if len(elt[1]) > 1: + text = self.format(elt[1][1]) + else: + text = None (qual,sep,tgt) = arg.partition(':') if tgt != '': ns = self.wiki_ns_name(qual) if ns: if ns == 'NS_IMAGE': if not self.references: - return None + return "" text = "[%s: %s]" % (qual, text if text else arg) tgt = self.image_base + '/' + \ urllib.quote(tgt) + \ '/250px-' + urllib.quote(tgt) elif ns == 'NS_MEDIA': text = "[%s]" % (qual) @@ -91,82 +95,133 @@ class TextWikiMarkup (WikiMarkup): if self.references: return "%s (see %s) " % (text, tgt) elif not text or text == '': return arg else: return text - - def str_link(self, tok, env): - return self.link(tok, env, False) - - def str_tmpl(self, tok, env): - return self.link(tok, env, True) - - def str_ref(self, tok, env): - return self.xref(self.fmtok(tok[2], env), self.fmtok(tok[1], env)) - - def str_it(self, tok, env): - if self.markup: - return "_" + self.fmtok(tok[1], env) + "_" - return self.fmtok(tok[1], env); - - def str_bold(self, tok, env): - if self.markup: - return self.fmtok(tok[1], env).upper() - return self.fmtok(tok[1], env); - - def str_hdr(self, tok, env): - level = tok[1] - return "\n\n" + ("*" * level) + " " + self.fmtok(tok[2], env) + "\n\n" - - def str_bar(self, tok, env): - w = self.width - if w < 5: - w = 5 - return "\n" + ("-" * (w - 5)).center(w - 1) + "\n" - - def str_env(self, tok, env): - self.num = 1 - return "\n" + self.fmtok(tok[3], tok) def indent (self, lev, text): + print "T \"",text,"\"" w = self.width self.width = w - lev if text.find('\n') == -1: s = (" " * lev) + text else: s = "" for elt in text.split('\n'): - s += (" " * lev) + elt - if elt == '': - s += "\n" + s += (" " * lev) + elt + '\n' self.width = w return s + + def fmtpara(self, input): + output = "" + linebuf = "" + length = 0 + for s in input.split(): + wlen = len(s) + if linebuf.endswith("."): + wsc = 2 + else: + wsc = 1 + if length + wsc + wlen > self.width: + # FIXME: fill out linebuf + output += linebuf + '\n' + wsc = 0 + length = 0 + linebuf = "" + linebuf += " " * wsc + s + length += wsc + wlen + return output + linebuf - def str_item(self, tok, env): - t = env[1] - lev = env[2] - if lev > self.width - 4: - lev = 1 - if t == self.INDENT: - return self.indent(lev, self.fmtok(tok[1], env)) - elif t == self.ENVNUM: - n = self.num - self.num += 1 - return "" + self.indent(lev, - "%d. %s" % (n, self.fmtok(tok[1], env))) - elif t == self.ENVUNNUM: - return "" + self.indent(lev, - "- " + self.fmtok(tok[1], env)) + def fmtelt(self, elt, indent=0): + if elt[0] == TEXT: + if isinstance(elt[1],list): + string = "" + for s in elt[1]: + if string: + if string.endswith("."): + string += " " + else: + string += " " + string += s.rstrip(" ") + else: + string = elt[1] + elif elt[0] == PARA: + string = ""; + for x in elt[1]: + string += self.format(x) + string = self.fmtpara(string) + '\n\n' + elif elt[0] == IT: + string = "" + for x in elt[1]: + s = self.format(x) + if s: + string += " " + s.rstrip(" ") + string = "_" + string.lstrip(" ") + "_" + elif elt[0] == BOLD: + string = "" + for x in elt[1]: + s = self.format(x) + if s: + if string.endswith("."): + string += " " + else: + string += " " + string += s.rstrip(" ") + string = string.upper() + elif elt[0] == LINK: + string = self.fmtlink(elt, False) + elif elt[0] == TMPL: + string = '\n' + self.fmtlink(elt, True) + '\n' + elif elt[0] == BAR: + w = self.width + if w < 5: + w = 5 + string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n" + elif elt[0] == HDR: + level = elt[1] + string = "\n" + ("*" * level) + " " + \ + self.format(elt[2]).lstrip(" ") + "\n\n" + elif elt[0] == REF: + string = self.xref(self.format(elt[2]), elt[1]) + elif elt[0] == ENV: + type = elt[1] + lev = elt[2] + if lev > self.width - 4: + lev = 1 + string = "\n" + n = 1 + for s in elt[3]: + x = self.format(s) +# print "X",x + if type == ENVUNNUM: + string += self.indent(lev, "*" + x.lstrip(" ")) + '\n' + elif type == ENVNUM: + string += self.indent(lev, "%d. %s" % (n, x)) + '\n' + n += 1 + elif elt[0] == IND: + string = (" " * elt[1]) + self.format(elt[2]) + '\n' + else: + string = str(elt) + return string + + def format(self, elt, indent=0): + string = "" + if elt[0] == SEQ: + for x in elt[1]: + string += " " + self.format(x, indent) + else: + string += " " + self.fmtelt(elt, indent) + return string - def str_para(self, tok, env): - return "\n" - def __str__(self): - return self.fmtok(self.tree, None) + str = "" + for elt in self.tree: + str += self.format(elt) + return str class TextWiktionaryMarkup (TextWikiMarkup): """ See documentation for HtmlWiktionaryMarkup """ |