summary | refs | log | tree | commit | diff | about
author: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-06 14:01:23 (GMT)
committer: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-06 14:36:49 (GMT)
commit: b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch) (side-by-side diff)
tree: e6029ae08f00bc7affcd1d7aec75d1288f9184ea
parent: f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff)
download: wit-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz
download: wit-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2
Handle <tags> and implicit preformatted blocks
Among <tags>, this commit handles <nowiki> and <code>. A general tag-handling mechanism is provided.

* wikimarkup.py (otag, ctag, close_delim): New variables.
(BaseWikiMarkup)<newline,nested>: New attributes.
(input_tag): New abstract method.
(tokread): Remove calls to dprint, now done by the callers. Handle xml-style tags.
(getkn,ungetkn): Set newline.
(inline_delims): Add '|'.
(parse_para): Decide whether it is going to be a PRE or PARA. Don't mix the two. Fix recovery in case of unmatched/incorrect inline constructs.
(parse): Eliminate initial PARA, if called as a nested instance.
(WikiMarkup): Remove parse method. Rely on the parent class.
* wiki2html.py (input_tag, str_tag, str_pre): New methods.
(format): Handle PRE and TAG tokens.
* wiki2text.py: Similar changes. Needs some more work.
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r-- wiki2html.py 30
-rw-r--r-- wiki2text.py 29
-rw-r--r-- wikimarkup.py 112
3 files changed, 139 insertions, 32 deletions
diff --git a/wiki2html.py b/wiki2html.py
index eee592d..061377b 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -172,12 +172,38 @@ class HtmlWikiMarkup (WikiMarkup):
self.envt[type]["hdr"])
return string
+ supported_tags = [ 'nowiki', 'code' ]
+ def input_tag(self, tag):
+ return tag['tag'] in self.supported_tags
+
+ def str_tag(self, elt):
+ if elt['tag'] == 'nowiki':
+ return '<pre>' + elt['content'] + '</pre>'
+ elif elt['tag'] == 'code':
+ kwdict = {
+ 'nested': self.nested + 1,
+ 'lang': self.lang,
+ 'text': elt['content'],
+ 'html_base': self.html_base,
+ 'image_base': self.image_base,
+ 'media_base': self.media_base }
+ markup = HtmlWiktionaryMarkup(**kwdict)
+ markup.debug_level = self.debug_level
+ markup.parse()
+ return '<pre><code>' + str(markup) + '</code></pre>' #FIXME
+
def str_para(self, elt):
string = "";
for x in elt['content']:
string += self.format(x)
return "<p>" + string + "</p>"
+ def str_pre(self, elt):
+ string = "";
+ for x in elt['content']:
+ string += self.format(x)
+ return '<pre>' + string + '</pre>'
+
def str_ind(self, elt):
return ("&nbsp;" * 2 * elt['level']) + self.format(elt['content'])
@@ -190,8 +216,12 @@ class HtmlWikiMarkup (WikiMarkup):
else:
string = elt['content']
return string
+ elif elt['type'] == 'TAG':
+ return self.str_tag(elt)
elif elt['type'] == 'PARA':
return self.str_para(elt)
+ elif elt['type'] == 'PRE':
+ return self.str_pre(elt)
elif elt['type'] == 'IT':
return self.str_it(elt)
elif elt['type'] == 'BOLD':
diff --git a/wiki2text.py b/wiki2text.py
index c94ae51..3084ee4 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -142,6 +142,26 @@ class TextWikiMarkup (WikiMarkup):
length += wsc + wlen
return output + linebuf
+ supported_tags = [ 'nowiki', 'code' ]
+ def input_tag(self, tag):
+ return tag['tag'] in self.supported_tags
+
+ def str_tag(self, elt):
+ if elt['tag'] == 'nowiki':
+ return elt['content']
+ elif elt['tag'] == 'code':
+ kwdict = {
+ 'nested': self.nested + 1,
+ 'lang': self.lang,
+ 'text': elt['content'],
+ 'html_base': self.html_base,
+ 'image_base': self.image_base,
+ 'media_base': self.media_base }
+ markup = TextWiktionaryMarkup(**kwdict)
+ markup.debug_level = self.debug_level
+ markup.parse()
+ return str(markup)
+
def format(self, elt):
if elt['type'] == 'TEXT':
if isinstance(elt['content'],list):
@@ -155,11 +175,18 @@ class TextWikiMarkup (WikiMarkup):
string += s
else:
string = elt['content']
+ elif elt['type'] == 'PRE':
+ string = ""
+ for x in elt['content']:
+ string += self.format(x)
+ string += '\n'
elif elt['type'] == 'PARA':
string = "";
for x in elt['content']:
string += self.format(x)
string = self.fmtpara(string) + '\n\n'
+ elif elt['type'] == 'TAG':
+ string = self.str_tag(elt)
elif elt['type'] == 'IT':
string = ""
for x in elt['content']:
@@ -214,7 +241,7 @@ class TextWikiMarkup (WikiMarkup):
string += self.fmtpara(self.indent(lev, "%d. %s" % (n, x)))
n += 1
elif type == "defn":
- if s[1] == 0:
+ if s['subtype'] == 0:
string += self.indent(lev-1, x)
else:
string += self.indent(lev+3, x)
diff --git a/wikimarkup.py b/wikimarkup.py
index 09c48eb..636012e 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -23,6 +23,14 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup",
"envtypes" ]
delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)")
+otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
+ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+
+close_delim = {
+ '[': ']',
+ '[[': ']]',
+ '{{': '}}'
+}
# Environment types:
envtypes = { "*": [ "unnumbered", 0 ],
@@ -35,13 +43,18 @@ class BaseWikiMarkup:
toklist = None
tokind = 0
+ newline = 0
tree = None
+ nested = 0
debug_level = 0
def dprint(self, lev, fmt, *argv):
if self.debug_level >= lev:
print "[DEBUG]", fmt % argv
+
+ def input_tag(self, tag):
+ pass
def tokread(self):
line = None
@@ -55,12 +68,10 @@ class BaseWikiMarkup:
line = u''
if not line or line == "":
- self.dprint(100, "YIELD: NIL")
yield({ 'type': 'NIL' })
break
if line == '\n':
- self.dprint(100, "YIELD: NL")
yield({ 'type': 'NL', 'content': line })
line = None
continue
@@ -70,32 +81,52 @@ class BaseWikiMarkup:
if m:
if (pos < m.start(0)):
- self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)])
yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
pos = m.end(0)
if envtypes.has_key(m.group(0)[0]) and line[pos] == ":":
# FIXME?
- self.dprint(100, "YIELD: DELIM %s, True", m.group(0))
# FIXME: What's "extra"?
yield({ 'type': 'DELIM',
'content': m.group(0),
'extra': True })
pos += 1
else:
- self.dprint(100, "YIELD: DELIM %s", m.group(0))
yield({ 'type': 'DELIM',
'content': m.group(0) })
else:
+ m = otag.match(line)
+ if m:
+ t = { 'type': 'TAG',
+ 'tag': m.group('tag'),
+ 'args': m.group('args') }
+
+ if self.input_tag(t):
+ s = ''
+ if not m.group('closed'):
+ while 1:
+ try:
+ l = self.input()
+ m = ctag.match(l)
+ if m and m.group('tag') == t['tag']:
+ break
+ s += l
+ except StopIteration:
+ break
+ yield({ 'type': 'TAG',
+ 'tag': t['tag'],
+ 'args': t['args'],
+ 'content': s
+ })
+ line = None
+ continue
+
if line[-1] == '\n':
- self.dprint(100, "YIELD: TEXT %s", line[pos:-1])
if line[pos:-1] != '':
yield({ 'type': 'TEXT',
'content': line[pos:-1] })
- self.dprint(100, "YIELD: NL")
yield({ 'type': 'NL',
'content': '\n' })
else:
- self.dprint(100, "YIELD: TEXT %s", line[pos:])
yield({ 'type': 'TEXT',
'content': line[pos:] })
line = None
@@ -106,6 +137,7 @@ class BaseWikiMarkup:
def tokenize(self):
self.toklist = []
for tok in self.tokread():
+ self.dprint(100, "TOK: %s", tok)
self.toklist.append(tok)
# Determine and fix up the ordering of bold and italic markers
# This helps correctly parse inputs like:
@@ -133,13 +165,15 @@ class BaseWikiMarkup:
self.toklist[self.tokind] = val
def getkn(self):
+ self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
tok = self.toklist[self.tokind]
if tok['type'] != 'NIL':
self.tokind = self.tokind + 1
return tok
-
+
def ungetkn(self):
self.tokind = self.tokind - 1
+ self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
return self.toklist[self.tokind]
def parse_fontmod(self,delim,what):
@@ -248,7 +282,7 @@ class BaseWikiMarkup:
self.dprint(80, "LEAVE parse_ref= %s", ret)
return ret
- inline_delims = [ "''", "'''", "[", "[[", "{{" ]
+ inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ]
def is_inline_delim(self, tok):
return tok['type'] == 'DELIM' and tok['content'] in self.inline_delims
@@ -280,9 +314,19 @@ class BaseWikiMarkup:
self.dprint(80, "ENTER parse_para, tok %s", self.peektkn())
seq = []
textlist = []
+ tok = self.peektkn()
+ if re.match("^\s", tok['content']):
+ type = 'PRE'
+ rx = re.compile("^\S")
+ else:
+ type = 'PARA'
+ rx = re.compile("^\s")
while 1:
tok = self.getkn()
if tok['type'] == 'TEXT':
+ if self.newline and rx.match(tok['content']):
+ self.ungetkn()
+ break
textlist.append(tok['content'])
elif tok['type'] == 'NL':
tok = self.getkn()
@@ -304,8 +348,22 @@ class BaseWikiMarkup:
if x:
seq.append(x)
else:
- seq.append(tok)
- break
+ self.dprint(80, "ROLLBACK parse_para=%s", tok)
+ od = tok['content']
+ textlist.append(od)
+ if close_delim.has_key(od):
+ cd = close_delim[od]
+ lev = 0
+ for tok in self.toklist[self.tokind:]:
+ if tok['type'] == 'NIL':
+ break
+ elif tok['type'] == 'DELIM':
+ if tok['content'] == od:
+ lev += 1
+ elif tok['content'] == cd:
+ if lev == 0:
+ tok['type'] = 'TEXT'
+ break
else:
seq.append({ 'type': 'TEXT', 'content': tok['content'] })
# self.ungetkn()
@@ -313,7 +371,7 @@ class BaseWikiMarkup:
if textlist:
seq.append({ 'type': 'TEXT', 'content': textlist })
self.dprint(80, "LEAVE parse_para=%s", seq)
- return { 'type': 'PARA', 'content': seq }
+ return { 'type': type, 'content': seq }
def parse_header(self, delim):
self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn())
@@ -407,7 +465,7 @@ class BaseWikiMarkup:
x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() }
self.dprint(80, "LEAVE parse_indent=%s", x)
return x
-
+
def parse0(self):
tok = self.getkn()
toktype = tok['type']
@@ -434,7 +492,9 @@ class BaseWikiMarkup:
elif toktype == 'NL':
return { 'type': 'TEXT', 'content': '\n' }
# return self.parse0()
-
+ else:
+ return tok
+
def parse(self):
if not self.toklist:
self.tokenize()
@@ -446,6 +506,9 @@ class BaseWikiMarkup:
if subtree == None:
break
self.tree.append(subtree)
+ if self.nested:
+ if self.tree[0]['type'] == 'PARA':
+ self.tree[0]['type'] = 'SEQ'
self.dprint(70, "TREE: %s", self.tree)
def __str__(self):
@@ -495,6 +558,8 @@ class WikiMarkup (BaseWikiMarkup):
self.image_base = keywords[kw]
elif kw == 'media_base':
self.media_base = keywords[kw]
+ elif kw == 'nested':
+ self.nested = keywords[kw]
def __del__(self):
if self.file:
@@ -541,21 +606,6 @@ class WikiMarkup (BaseWikiMarkup):
return False
return True
- def parse(self):
- BaseWikiMarkup.parse(self)
- # # Remove everything before the first header
- # for i in range(0, len(self.tree)):
- # if self.tree[i][0] == HDR:
- # self.tree = self.tree[i:]
- # break
- # # Remove trailing links
- # for i in range(len(self.tree)-1, 0, -1):
- # if self.tree[i][0] == PARA \
- # and not self.is_empty_para(self.tree[i][1]):
- # self.tree = self.tree[0:i+1]
- # break
-
-
# ISO 639
langtab = {
"aa": "Afar", # Afar
@@ -572,7 +622,7 @@ class WikiMarkup (BaseWikiMarkup):
"as": "অসমীয়া", # Assamese
"ast": "Asturian",
"av": "Авар", # Avaric
- "ay": "Aymar", # Aymara
+ "ay": "Aymara", # Aymara
"az": "Azərbaycan" , # Azerbaijani
"ba": "Башҡорт", # Bashkir

Return to:

Send suggestions and report system problems to the System administrator.