summary | refs | log | tree | commit | diff | about
path: root/wiki2text.py
author Sergey Poznyakoff <gray@gnu.org.ua>2009-03-02 18:58:09 (GMT)
committer Sergey Poznyakoff <gray@gnu.org.ua>2009-03-02 22:31:18 (GMT)
commit ae8b8bc81eab08b2ebe9f8c0957c085b5d45fc2b (patch) (side-by-side diff)
tree 551e90f993a83674faa367b776538c44704e78a6 /wiki2text.py
parent 86ee544f442aa3c4a0516a620890ec64de0770cc (diff)
download wikitrans-ae8b8bc81eab08b2ebe9f8c0957c085b5d45fc2b.tar.gz
wikitrans-ae8b8bc81eab08b2ebe9f8c0957c085b5d45fc2b.tar.bz2
Rewrite from scratch. Text conversion almost(TM) works
Diffstat (limited to 'wiki2text.py') (more/less context) (ignore whitespace changes)
-rw-r--r-- wiki2text.py 177
1 files changed, 116 insertions, 61 deletions
diff --git a/wiki2text.py b/wiki2text.py
index f28c343..c41c4e0 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -26,7 +26,7 @@ class TextWikiMarkup (WikiMarkup):
"""
# Output width
- width = 80
+ width = 78
# Do not show references.
references = False
# Provide a minimum markup
@@ -57,22 +57,26 @@ class TextWikiMarkup (WikiMarkup):
for elt in wiki_ns_re[self.lang][str]:
if str.beginswith(elt[0]) and str.endswith(elt[1]):
return elt[2]
- return None
+ return None
+
def mktgt(self, tgt, lang = None):
if not lang:
lang = self.lang
return self.html_base % { 'lang' : lang } + urllib.quote(tgt)
- def link(self, tok, env, istmpl):
- arg = self.fmtok(tok[1], env)
- text = self.fmtok(tok[2], env)
+ def fmtlink(self, elt, istmpl):
+ arg = self.format(elt[1][0])
+ if len(elt[1]) > 1:
+ text = self.format(elt[1][1])
+ else:
+ text = None
(qual,sep,tgt) = arg.partition(':')
if tgt != '':
ns = self.wiki_ns_name(qual)
if ns:
if ns == 'NS_IMAGE':
if not self.references:
- return None
+ return ""
text = "[%s: %s]" % (qual, text if text else arg)
tgt = self.image_base + '/' + \
urllib.quote(tgt) + \
@@ -94,41 +98,9 @@ class TextWikiMarkup (WikiMarkup):
return arg
else:
return text
-
- def str_link(self, tok, env):
- return self.link(tok, env, False)
-
- def str_tmpl(self, tok, env):
- return self.link(tok, env, True)
-
- def str_ref(self, tok, env):
- return self.xref(self.fmtok(tok[2], env), self.fmtok(tok[1], env))
-
- def str_it(self, tok, env):
- if self.markup:
- return "_" + self.fmtok(tok[1], env) + "_"
- return self.fmtok(tok[1], env);
-
- def str_bold(self, tok, env):
- if self.markup:
- return self.fmtok(tok[1], env).upper()
- return self.fmtok(tok[1], env);
-
- def str_hdr(self, tok, env):
- level = tok[1]
- return "\n\n" + ("*" * level) + " " + self.fmtok(tok[2], env) + "\n\n"
-
- def str_bar(self, tok, env):
- w = self.width
- if w < 5:
- w = 5
- return "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
-
- def str_env(self, tok, env):
- self.num = 1
- return "\n" + self.fmtok(tok[3], tok)
def indent (self, lev, text):
+ print "T \"",text,"\""
w = self.width
self.width = w - lev
if text.find('\n') == -1:
@@ -136,34 +108,117 @@ class TextWikiMarkup (WikiMarkup):
else:
s = ""
for elt in text.split('\n'):
- s += (" " * lev) + elt
- if elt == '':
- s += "\n"
+ s += (" " * lev) + elt + '\n'
self.width = w
return s
+
+ def fmtpara(self, input):
+ output = ""
+ linebuf = ""
+ length = 0
+ for s in input.split():
+ wlen = len(s)
+ if linebuf.endswith("."):
+ wsc = 2
+ else:
+ wsc = 1
+ if length + wsc + wlen > self.width:
+ # FIXME: fill out linebuf
+ output += linebuf + '\n'
+ wsc = 0
+ length = 0
+ linebuf = ""
+ linebuf += " " * wsc + s
+ length += wsc + wlen
+ return output + linebuf
- def str_item(self, tok, env):
- t = env[1]
- lev = env[2]
- if lev > self.width - 4:
- lev = 1
- if t == self.INDENT:
- return self.indent(lev, self.fmtok(tok[1], env))
- elif t == self.ENVNUM:
- n = self.num
- self.num += 1
- return "" + self.indent(lev,
- "%d. %s" % (n, self.fmtok(tok[1], env)))
- elif t == self.ENVUNNUM:
- return "" + self.indent(lev,
- "- " + self.fmtok(tok[1], env))
+ def fmtelt(self, elt, indent=0):
+ if elt[0] == TEXT:
+ if isinstance(elt[1],list):
+ string = ""
+ for s in elt[1]:
+ if string:
+ if string.endswith("."):
+ string += " "
+ else:
+ string += " "
+ string += s.rstrip(" ")
+ else:
+ string = elt[1]
+ elif elt[0] == PARA:
+ string = "";
+ for x in elt[1]:
+ string += self.format(x)
+ string = self.fmtpara(string) + '\n\n'
+ elif elt[0] == IT:
+ string = ""
+ for x in elt[1]:
+ s = self.format(x)
+ if s:
+ string += " " + s.rstrip(" ")
+ string = "_" + string.lstrip(" ") + "_"
+ elif elt[0] == BOLD:
+ string = ""
+ for x in elt[1]:
+ s = self.format(x)
+ if s:
+ if string.endswith("."):
+ string += " "
+ else:
+ string += " "
+ string += s.rstrip(" ")
+ string = string.upper()
+ elif elt[0] == LINK:
+ string = self.fmtlink(elt, False)
+ elif elt[0] == TMPL:
+ string = '\n' + self.fmtlink(elt, True) + '\n'
+ elif elt[0] == BAR:
+ w = self.width
+ if w < 5:
+ w = 5
+ string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
+ elif elt[0] == HDR:
+ level = elt[1]
+ string = "\n" + ("*" * level) + " " + \
+ self.format(elt[2]).lstrip(" ") + "\n\n"
+ elif elt[0] == REF:
+ string = self.xref(self.format(elt[2]), elt[1])
+ elif elt[0] == ENV:
+ type = elt[1]
+ lev = elt[2]
+ if lev > self.width - 4:
+ lev = 1
+ string = "\n"
+ n = 1
+ for s in elt[3]:
+ x = self.format(s)
+# print "X",x
+ if type == ENVUNNUM:
+ string += self.indent(lev, "*" + x.lstrip(" ")) + '\n'
+ elif type == ENVNUM:
+ string += self.indent(lev, "%d. %s" % (n, x)) + '\n'
+ n += 1
+ elif elt[0] == IND:
+ string = (" " * elt[1]) + self.format(elt[2]) + '\n'
+ else:
+ string = str(elt)
+ return string
+
+ def format(self, elt, indent=0):
+ string = ""
+ if elt[0] == SEQ:
+ for x in elt[1]:
+ string += " " + self.format(x, indent)
+ else:
+ string += " " + self.fmtelt(elt, indent)
+ return string
- def str_para(self, tok, env):
- return "\n"
-
def __str__(self):
- return self.fmtok(self.tree, None)
+ str = ""
+ for elt in self.tree:
+ str += self.format(elt)
+ return str
class TextWiktionaryMarkup (TextWikiMarkup):
"""

Return to:

Send suggestions and report system problems to the System administrator.