summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-06 17:01:23 +0300
committer: Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-06 17:36:49 +0300
commit: b74b1d5fe2326f56a2e37f57c38b929307c71282 (patch)
tree: e6029ae08f00bc7affcd1d7aec75d1288f9184ea
parent: f3378aebac7e89000ff097ac51c49b62eb6e9f08 (diff)
download: wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.gz
          wikitrans-b74b1d5fe2326f56a2e37f57c38b929307c71282.tar.bz2
Handle <tags> and implicit preformatted blocks

Among <tags>, this commit handles <nowiki> and <code>. General tag
handling mechanism is provided.

* wikimarkup.py (otag, ctag, close_delim): New variables.
(BaseWikiMarkup)<newline,nested>: New attributes.
(otag, ctag, close_delim): New variables.
<newline,nested>: New attributes.
(input_tag): New abstract method.
(tokread): Remove calls to dprint, now done by the callers.
Handle xml-style tags.
(getkn,ungetkn): Set newline.
(inline_delims): Add '|'.
(parse_para): Decide whether it is going to be a PRE or PARA.
Don't mix the two. Fix recovery in case of unmatched/incorrect
inline constructs.
(parse): Eliminate initial PARA, if called as a nested instance.
(WikiMarkup): Remove parse method. Rely on the parent class.
* wiki2html.py (input_tag, str_tag, str_pre): New methods.
(format): Handle PRE and TAG tokens.
* wiki2text.py: Similar changes. Needs some more work.

-rw-r--r--  wiki2html.py    30
-rw-r--r--  wiki2text.py    29
-rw-r--r--  wikimarkup.py  112
3 files changed, 139 insertions, 32 deletions
diff --git a/wiki2html.py b/wiki2html.py
index eee592d..061377b 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -174,2 +174,22 @@ class HtmlWikiMarkup (WikiMarkup):
+ supported_tags = [ 'nowiki', 'code' ]
+ def input_tag(self, tag):
+ return tag['tag'] in self.supported_tags
+
+ def str_tag(self, elt):
+ if elt['tag'] == 'nowiki':
+ return '<pre>' + elt['content'] + '</pre>'
+ elif elt['tag'] == 'code':
+ kwdict = {
+ 'nested': self.nested + 1,
+ 'lang': self.lang,
+ 'text': elt['content'],
+ 'html_base': self.html_base,
+ 'image_base': self.image_base,
+ 'media_base': self.media_base }
+ markup = HtmlWiktionaryMarkup(**kwdict)
+ markup.debug_level = self.debug_level
+ markup.parse()
+ return '<pre><code>' + str(markup) + '</code></pre>' #FIXME
+
def str_para(self, elt):
@@ -180,2 +200,8 @@ class HtmlWikiMarkup (WikiMarkup):
+ def str_pre(self, elt):
+ string = "";
+ for x in elt['content']:
+ string += self.format(x)
+ return '<pre>' + string + '</pre>'
+
def str_ind(self, elt):
@@ -192,4 +218,8 @@ class HtmlWikiMarkup (WikiMarkup):
return string
+ elif elt['type'] == 'TAG':
+ return self.str_tag(elt)
elif elt['type'] == 'PARA':
return self.str_para(elt)
+ elif elt['type'] == 'PRE':
+ return self.str_pre(elt)
elif elt['type'] == 'IT':
diff --git a/wiki2text.py b/wiki2text.py
index c94ae51..3084ee4 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -144,2 +144,22 @@ class TextWikiMarkup (WikiMarkup):
+ supported_tags = [ 'nowiki', 'code' ]
+ def input_tag(self, tag):
+ return tag['tag'] in self.supported_tags
+
+ def str_tag(self, elt):
+ if elt['tag'] == 'nowiki':
+ return elt['content']
+ elif elt['tag'] == 'code':
+ kwdict = {
+ 'nested': self.nested + 1,
+ 'lang': self.lang,
+ 'text': elt['content'],
+ 'html_base': self.html_base,
+ 'image_base': self.image_base,
+ 'media_base': self.media_base }
+ markup = TextWiktionaryMarkup(**kwdict)
+ markup.debug_level = self.debug_level
+ markup.parse()
+ return str(markup)
+
def format(self, elt):
@@ -157,2 +177,7 @@ class TextWikiMarkup (WikiMarkup):
string = elt['content']
+ elif elt['type'] == 'PRE':
+ string = ""
+ for x in elt['content']:
+ string += self.format(x)
+ string += '\n'
elif elt['type'] == 'PARA':
@@ -162,2 +187,4 @@ class TextWikiMarkup (WikiMarkup):
string = self.fmtpara(string) + '\n\n'
+ elif elt['type'] == 'TAG':
+ string = self.str_tag(elt)
elif elt['type'] == 'IT':
@@ -216,3 +243,3 @@ class TextWikiMarkup (WikiMarkup):
elif type == "defn":
- if s[1] == 0:
+ if s['subtype'] == 0:
string += self.indent(lev-1, x)
diff --git a/wikimarkup.py b/wikimarkup.py
index 09c48eb..636012e 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -25,2 +25,10 @@ __all__ = [ "BaseWikiMarkup", "WikiMarkup",
delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)")
+otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
+ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>")
+
+close_delim = {
+ '[': ']',
+ '[[': ']]',
+ '{{': '}}'
+}
@@ -37,4 +45,6 @@ class BaseWikiMarkup:
tokind = 0
+ newline = 0
tree = None
+ nested = 0
debug_level = 0
@@ -44,2 +54,5 @@ class BaseWikiMarkup:
print "[DEBUG]", fmt % argv
+
+ def input_tag(self, tag):
+ pass
@@ -57,3 +70,2 @@ class BaseWikiMarkup:
if not line or line == "":
- self.dprint(100, "YIELD: NIL")
yield({ 'type': 'NIL' })
@@ -62,3 +74,2 @@ class BaseWikiMarkup:
if line == '\n':
- self.dprint(100, "YIELD: NL")
yield({ 'type': 'NL', 'content': line })
@@ -72,3 +83,2 @@ class BaseWikiMarkup:
if (pos < m.start(0)):
- self.dprint(100, "YIELD: TEXT %s", line[pos:m.start(0)])
yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
@@ -77,3 +87,2 @@ class BaseWikiMarkup:
# FIXME?
- self.dprint(100, "YIELD: DELIM %s, True", m.group(0))
# FIXME: What's "extra"?
@@ -84,3 +93,2 @@ class BaseWikiMarkup:
else:
- self.dprint(100, "YIELD: DELIM %s", m.group(0))
yield({ 'type': 'DELIM',
@@ -88,4 +96,29 @@ class BaseWikiMarkup:
else:
+ m = otag.match(line)
+ if m:
+ t = { 'type': 'TAG',
+ 'tag': m.group('tag'),
+ 'args': m.group('args') }
+
+ if self.input_tag(t):
+ s = ''
+ if not m.group('closed'):
+ while 1:
+ try:
+ l = self.input()
+ m = ctag.match(l)
+ if m and m.group('tag') == t['tag']:
+ break
+ s += l
+ except StopIteration:
+ break
+ yield({ 'type': 'TAG',
+ 'tag': t['tag'],
+ 'args': t['args'],
+ 'content': s
+ })
+ line = None
+ continue
+
if line[-1] == '\n':
- self.dprint(100, "YIELD: TEXT %s", line[pos:-1])
if line[pos:-1] != '':
@@ -93,3 +126,2 @@ class BaseWikiMarkup:
'content': line[pos:-1] })
- self.dprint(100, "YIELD: NL")
yield({ 'type': 'NL',
@@ -97,3 +129,2 @@ class BaseWikiMarkup:
else:
- self.dprint(100, "YIELD: TEXT %s", line[pos:])
yield({ 'type': 'TEXT',
@@ -108,2 +139,3 @@ class BaseWikiMarkup:
for tok in self.tokread():
+ self.dprint(100, "TOK: %s", tok)
self.toklist.append(tok)
@@ -135,2 +167,3 @@ class BaseWikiMarkup:
def getkn(self):
+ self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
tok = self.toklist[self.tokind]
@@ -139,5 +172,6 @@ class BaseWikiMarkup:
return tok
-
+
def ungetkn(self):
self.tokind = self.tokind - 1
+ self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
return self.toklist[self.tokind]
@@ -250,3 +284,3 @@ class BaseWikiMarkup:
- inline_delims = [ "''", "'''", "[", "[[", "{{" ]
+ inline_delims = [ "''", "'''", "[", "[[", "{{", "|" ]
@@ -282,2 +316,9 @@ class BaseWikiMarkup:
textlist = []
+ tok = self.peektkn()
+ if re.match("^\s", tok['content']):
+ type = 'PRE'
+ rx = re.compile("^\S")
+ else:
+ type = 'PARA'
+ rx = re.compile("^\s")
while 1:
@@ -285,2 +326,5 @@ class BaseWikiMarkup:
if tok['type'] == 'TEXT':
+ if self.newline and rx.match(tok['content']):
+ self.ungetkn()
+ break
textlist.append(tok['content'])
@@ -306,4 +350,18 @@ class BaseWikiMarkup:
else:
- seq.append(tok)
- break
+ self.dprint(80, "ROLLBACK parse_para=%s", tok)
+ od = tok['content']
+ textlist.append(od)
+ if close_delim.has_key(od):
+ cd = close_delim[od]
+ lev = 0
+ for tok in self.toklist[self.tokind:]:
+ if tok['type'] == 'NIL':
+ break
+ elif tok['type'] == 'DELIM':
+ if tok['content'] == od:
+ lev += 1
+ elif tok['content'] == cd:
+ if lev == 0:
+ tok['type'] = 'TEXT'
+ break
else:
@@ -315,3 +373,3 @@ class BaseWikiMarkup:
self.dprint(80, "LEAVE parse_para=%s", seq)
- return { 'type': 'PARA', 'content': seq }
+ return { 'type': type, 'content': seq }
@@ -409,3 +467,3 @@ class BaseWikiMarkup:
return x
-
+
def parse0(self):
@@ -436,3 +494,5 @@ class BaseWikiMarkup:
# return self.parse0()
-
+ else:
+ return tok
+
def parse(self):
@@ -448,2 +508,5 @@ class BaseWikiMarkup:
self.tree.append(subtree)
+ if self.nested:
+ if self.tree[0]['type'] == 'PARA':
+ self.tree[0]['type'] = 'SEQ'
self.dprint(70, "TREE: %s", self.tree)
@@ -497,2 +560,4 @@ class WikiMarkup (BaseWikiMarkup):
self.media_base = keywords[kw]
+ elif kw == 'nested':
+ self.nested = keywords[kw]
@@ -543,17 +608,2 @@ class WikiMarkup (BaseWikiMarkup):
- def parse(self):
- BaseWikiMarkup.parse(self)
- # # Remove everything before the first header
- # for i in range(0, len(self.tree)):
- # if self.tree[i][0] == HDR:
- # self.tree = self.tree[i:]
- # break
- # # Remove trailing links
- # for i in range(len(self.tree)-1, 0, -1):
- # if self.tree[i][0] == PARA \
- # and not self.is_empty_para(self.tree[i][1]):
- # self.tree = self.tree[0:i+1]
- # break
-
-
# ISO 639
@@ -574,3 +624,3 @@ class WikiMarkup (BaseWikiMarkup):
"av": "Авар", # Avaric
- "ay": "Aymar", # Aymara
+ "ay": "Aymara", # Aymara
"az": "Azərbaycan" , # Azerbaijani

Return to:

Send suggestions and report system problems to the System administrator.