about summary refs log tree commit diff
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org> 2018-08-01 17:18:07 +0300
committer Sergey Poznyakoff <gray@gnu.org> 2018-08-01 17:22:24 +0300
commit d8a4e3719e300869759393e9df7dd473f15be781 (patch)
tree 7781fd56761a5c33befb34ca5eaf16ad9e9970ae
parent a85089f4495839590c791c02c833cd0f9b49733e (diff)
download wit-d8a4e3719e300869759393e9df7dd473f15be781.tar.gz
wit-d8a4e3719e300869759393e9df7dd473f15be781.tar.bz2
Fix wiki2text
* WikiTrans/wiki2html.py (HtmlTextNode): Escape html entities.
* WikiTrans/wiki2text.py: Remove the format method.
* testdata/headings.html: Fix.
* testdata/nowiki-tag.html: Fix.
* tests/test.py: Rename to tests/test-html.py.
* tests/test-text.py: New file.
* testdata/bold.text: New file.
* testdata/boldit1.text: New file.
* testdata/boldit2.text: New file.
* testdata/boldit3.text: New file.
* testdata/colon.text: New file.
* testdata/deflist.text: New file.
* testdata/headings.text: New file.
* testdata/hz.text: New file.
* testdata/it.text: New file.
* testdata/itbold1.text: New file.
* testdata/itbold2.text: New file.
* testdata/itbold3.text: New file.
* testdata/nowiki-ind.text: New file.
* testdata/nowiki-tag.text: New file.
* testdata/nowiki.text: New file.
* testdata/numlist.text: New file.
* testdata/para.text: New file.
* testdata/unlist.text: New file.
-rw-r--r-- WikiTrans/wiki2html.py 9
-rw-r--r-- WikiTrans/wiki2text.py 144
-rw-r--r-- testdata/bold.text 3
-rw-r--r-- testdata/boldit1.text 3
-rw-r--r-- testdata/boldit2.text 3
-rw-r--r-- testdata/boldit3.text 3
-rw-r--r-- testdata/colon.text 8
-rw-r--r-- testdata/deflist.text 7
-rw-r--r-- testdata/headings.html 2
-rw-r--r-- testdata/headings.text 20
-rw-r--r-- testdata/hz.text 8
-rw-r--r-- testdata/it.text 3
-rw-r--r-- testdata/itbold1.text 3
-rw-r--r-- testdata/itbold2.text 3
-rw-r--r-- testdata/itbold3.text 3
-rw-r--r-- testdata/nowiki-ind.text 7
-rw-r--r-- testdata/nowiki-tag.html 3
-rw-r--r-- testdata/nowiki-tag.text 3
-rw-r--r-- testdata/nowiki.text 3
-rw-r--r-- testdata/numlist.text 9
-rw-r--r-- testdata/para.text 6
-rw-r--r-- testdata/unlist.text 12
-rw-r--r-- tests/test-html.py (renamed from tests/test.py) 0
-rw-r--r-- tests/test-text.py 123
24 files changed, 266 insertions, 122 deletions
diff --git a/WikiTrans/wiki2html.py b/WikiTrans/wiki2html.py
index 67cb9c3..d2af746 100644
--- a/WikiTrans/wiki2html.py
+++ b/WikiTrans/wiki2html.py
@@ -25,6 +25,11 @@ try:
except ImportError:
from urllib.parse import quote as url_quote
+try:
+ from html import escape as html_escape
+except ImportError:
+ from cgi import escape as html_escape
+
__all__ = [ "HtmlWikiMarkup", "HtmlWiktionaryMarkup" ]
class HtmlSeqNode(WikiSeqNode):
@@ -100,9 +105,9 @@ class HtmlTextNode(HtmlSeqNode):
if isinstance(self.content,list):
s = ''.join(self.content)
else:
- s = self.content
+ s = html_escape(self.content, quote=True)
return s
-
+
class HtmlHdrNode(WikiHdrNode):
def format(self):
level = self.level
diff --git a/WikiTrans/wiki2text.py b/WikiTrans/wiki2text.py
index cd4937e..004ae4a 100644
--- a/WikiTrans/wiki2text.py
+++ b/WikiTrans/wiki2text.py
@@ -19,7 +19,10 @@ from WikiTrans.wikitoken import *
from WikiTrans.wikimarkup import *
from WikiTrans.wikins import wiki_ns_re, wiki_ns
import re
-import urllib
+try:
+ from urllib import quote as url_quote
+except ImportError:
+ from urllib.parse import quote as url_quote
class TextSeqNode(WikiSeqNode):
def format(self):
@@ -32,9 +35,9 @@ class TextSeqNode(WikiSeqNode):
class TextTextNode(WikiTextNode):
def format(self):
- if isinstance(elt.content,list):
+ if isinstance(self.content,list):
string = ""
- for s in elt.content:
+ for s in self.content:
if string:
if string.endswith("."):
string += " "
@@ -42,13 +45,13 @@ class TextTextNode(WikiTextNode):
string += " "
string += s
else:
- string = elt.content
+ string = self.content
return string
class TextPreNode(WikiSeqNode):
def format(self):
string = ""
- for x in elt.content:
+ for x in self.content:
string += x.format()
string += '\n'
return string
@@ -56,7 +59,7 @@ class TextPreNode(WikiSeqNode):
class TextParaNode(WikiSeqNode):
def format(self):
string = ""
- for x in elt.content:
+ for x in self.content:
string += x.format()
string = self.parser.fmtpara(string) + '\n\n'
return string
@@ -64,7 +67,7 @@ class TextParaNode(WikiSeqNode):
class TextItNode(WikiSeqNode):
def format(self):
string = ""
- for x in elt.content:
+ for x in self.content:
s = x.format()
if s:
string += " " + s
@@ -73,7 +76,7 @@ class TextItNode(WikiSeqNode):
class TextBoldNode(WikiSeqNode):
def format(self):
string = ""
- for x in elt.content:
+ for x in self.content:
if string.endswith("."):
string += " "
else:
@@ -105,8 +108,8 @@ class TextLinkNode(WikiSeqNode):
return ""
text = "[%s: %s]" % (qual, text if text else arg)
tgt = self.image_base + '/' + \
- urllib.quote(tgt) + \
- '/250px-' + urllib.quote(tgt)
+ url_quote(tgt) + \
+ '/250px-' + url_quote(tgt)
elif ns == 'NS_MEDIA':
text = "[%s]" % (qual)
else:
@@ -139,7 +142,7 @@ class TextBarNode(WikiNode):
class TextHdrNode(WikiHdrNode):
def format(self):
return "\n" + ("*" * self.level) + " " + \
- elt.content.format().lstrip(" ") + "\n\n"
+ self.content.format().lstrip(" ") + "\n\n"
class TextRefNode(WikiRefNode):
def format(self):
@@ -162,9 +165,9 @@ class TextEnvNode(WikiEnvNode):
string += "\n"
x = s.content.format()
if type == "unnumbered":
- string += self.parser.fmtpara(self.parser.indent(lev, "- " + x.lstrip(" ")))
+ string += self.parser.indent(lev, "- " + x.lstrip(" "))
elif type == "numbered":
- string += self.parser.fmtpara(self.parser.indent(lev, "%d. %s" % (n, x)))
+ string += self.parser.indent(lev, "%d. %s" % (n, x))
n += 1
elif type == "defn":
if s.subtype == 0:
@@ -175,7 +178,7 @@ class TextEnvNode(WikiEnvNode):
if not string.endswith("\n"):
string += "\n"
- return string
+ return string
class TextIndNode(WikiIndNode):
def format(self):
@@ -185,13 +188,13 @@ class TextTagNode(WikiTagNode):
def format(self):
if self.tag == 'code':
self.parser.nested += 1
- s = elt.content.format()
+ s = self.content.format()
self.parser.nested -= 1
else:
s = '<' + self.tag
if self.args:
s += ' ' + str(self.args)
- s += '>' + elt.content.format() + '</' + self.tag + '>'
+ s += '>' + self.content.format() + '</' + self.tag + '>'
return s
@@ -246,7 +249,7 @@ class TextWikiMarkup (WikiMarkup):
def mktgt(self, tgt, lang = None):
if not lang:
lang = self.lang
- return self.html_base % { 'lang' : lang } + urllib.quote(tgt)
+ return self.html_base % { 'lang' : lang } + url_quote(tgt)
def fmtlink(self, elt, istmpl):
arg = self.format(elt.content[0])
@@ -271,8 +274,8 @@ class TextWikiMarkup (WikiMarkup):
return ""
text = "[%s: %s]" % (qual, text if text else arg)
tgt = self.image_base + '/' + \
- urllib.quote(tgt) + \
- '/250px-' + urllib.quote(tgt)
+ url_quote(tgt) + \
+ '/250px-' + url_quote(tgt)
elif ns == 'NS_MEDIA':
text = "[%s]" % (qual)
else:
@@ -311,7 +314,9 @@ class TextWikiMarkup (WikiMarkup):
length = 0
for s in input.split():
wlen = len(s)
- if linebuf.endswith("."):
+ if len(linebuf) == 0:
+ wsc = 0
+ elif linebuf.endswith("."):
wsc = 2
else:
wsc = 1
@@ -338,107 +343,10 @@ class TextWikiMarkup (WikiMarkup):
s += '>' + self.format(elt.content) + '</' + elt.tag + '>'
return s
- def format(self, elt):
- if elt.type == 'TEXT':
- if isinstance(elt.content,list):
- string = ""
- for s in elt.content:
- if string:
- if string.endswith("."):
- string += " "
- else:
- string += " "
- string += s
- else:
- string = elt.content
- elif elt.type == 'PRE':
- string = ""
- for x in elt.content:
- string += self.format(x)
- string += '\n'
- elif elt.type == 'PARA':
- string = "";
- for x in elt.content:
- string += self.format(x)
- string = self.fmtpara(string) + '\n\n'
- elif elt.type == 'TAG':
- string = self.str_tag(elt)
- elif elt.type == 'IT':
- string = ""
- for x in elt.content:
- s = self.format(x)
- if s:
- string += " " + s
- string = "_" + string.lstrip(" ") + "_"
- elif elt.type == 'BOLD':
- string = ""
- for x in elt.content:
- s = self.format(x)
- if s:
- if string.endswith("."):
- string += " "
- else:
- string += " "
- string += s
- string = string.upper()
- elif elt.type == 'LINK':
- string = self.fmtlink(elt, False)
- elif elt.type == 'TMPL':
- s = self.fmtlink(elt, True)
- if s:
- string = '[' + s + ']'
- else:
- string = s
- elif elt.type == 'BAR':
- w = self.width
- if w < 5:
- w = 5
- string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n"
- elif elt.type == 'HDR':
- string = "\n" + ("*" * elt.level) + " " + \
- self.format(elt.content).lstrip(" ") + "\n\n"
- elif elt.type == 'REF':
- string = self.xref(self.format(elt.content), elt.ref)
- elif elt.type == 'ENV':
- type = elt.envtype
- lev = elt.level
- if lev > self.width - 4:
- lev = 1
- string = ""
- n = 1
- for s in elt.content:
- if not string.endswith("\n"):
- string += "\n"
- x = self.format(s.content)
- if type == "unnumbered":
- string += self.fmtpara(self.indent(lev, "- " + x.lstrip(" ")))
- elif type == "numbered":
- string += self.fmtpara(self.indent(lev, "%d. %s" % (n, x)))
- n += 1
- elif type == "defn":
- if s.subtype == 0:
- string += self.indent(lev-1, x)
- else:
- string += self.indent(lev+3, x)
-
- if not string.endswith("\n"):
- string += "\n"
- elif elt.type == 'IND':
- string = (" " * elt.level) + self.format(elt.content) + '\n'
- elif elt.type == 'SEQ':
- string = ""
- for x in elt.content:
- if len(string) > 1 and not string[-1].isspace():
- string += ' '
- string += self.format(x)
- else:
- string = str(elt)
- return string
-
def __str__(self):
str = ""
for elt in self.tree:
- str += self.format(elt)
+ str += elt.format()
return str
class TextWiktionaryMarkup (TextWikiMarkup):
diff --git a/testdata/bold.text b/testdata/bold.text
new file mode 100644
index 0000000..30042f6
--- /dev/null
+++ b/testdata/bold.text
@@ -0,0 +1,3 @@
+now is the time for ALL GOOD men to come to
+
+
diff --git a/testdata/boldit1.text b/testdata/boldit1.text
new file mode 100644
index 0000000..adad116
--- /dev/null
+++ b/testdata/boldit1.text
@@ -0,0 +1,3 @@
+A B _C_ D
+
+
diff --git a/testdata/boldit2.text b/testdata/boldit2.text
new file mode 100644
index 0000000..2ac53de
--- /dev/null
+++ b/testdata/boldit2.text
@@ -0,0 +1,3 @@
+_A B_ C D
+
+
diff --git a/testdata/boldit3.text b/testdata/boldit3.text
new file mode 100644
index 0000000..27ebe3f
--- /dev/null
+++ b/testdata/boldit3.text
@@ -0,0 +1,3 @@
+A B _C D_
+
+
diff --git a/testdata/colon.text b/testdata/colon.text
new file mode 100644
index 0000000..b315cec
--- /dev/null
+++ b/testdata/colon.text
@@ -0,0 +1,8 @@
+ A colon (:) indents a line or paragraph.
+A newline starts a new paragraph. Should only be used on talk pages. For
+articles, you probably want the blockquote tag.
+
+ We use 1 colon to indent once.
+ We use 2 colons to indent twice.
+ 3 colons to indent 3 times, and so on.
+
diff --git a/testdata/deflist.text b/testdata/deflist.text
new file mode 100644
index 0000000..b04fa95
--- /dev/null
+++ b/testdata/deflist.text
@@ -0,0 +1,7 @@
+
+item 1
+ definition 1
+item 2
+ definition 2-1
+ definition 2-2
+
diff --git a/testdata/headings.html b/testdata/headings.html
index 445c821..9a8eaf0 100644
--- a/testdata/headings.html
+++ b/testdata/headings.html
@@ -5,7 +5,7 @@ sections. The Wiki software can automatically
generate a <a href="http://pl.wiktionary.org/wiki/table%20of%20contents">table of contents</a> from them.</p>
<h3>Subsection</h3>
-<p>Using more "equals" (=) signs creates a subsection.</p>
+<p>Using more &quot;equals&quot; (=) signs creates a subsection.</p>
<h4>A smaller subsection</h4>
<p>Don't skip levels,
diff --git a/testdata/headings.text b/testdata/headings.text
new file mode 100644
index 0000000..e6c30ea
--- /dev/null
+++ b/testdata/headings.text
@@ -0,0 +1,20 @@
+
+** Section headings
+
+_Headings_ organize your writing into sections. The Wiki software can
+automatically generate a table of contents from them.
+
+
+*** Subsection
+
+Using more "equals" (=) signs creates a subsection.
+
+
+**** A smaller subsection
+
+Don't skip levels, like from two to four equals signs.
+
+Start with 2 equals signs not 1 because 1 creates H1 tags which should be
+reserved for page title.
+
+
diff --git a/testdata/hz.text b/testdata/hz.text
new file mode 100644
index 0000000..bdc1764
--- /dev/null
+++ b/testdata/hz.text
@@ -0,0 +1,8 @@
+You can make horizontal dividing lines (----) to separate text.
+
+
+ -------------------------------------------------------------------------
+But you should usually use sections instead, so that they go in the table of
+contents.
+
+
diff --git a/testdata/it.text b/testdata/it.text
new file mode 100644
index 0000000..3de3b6b
--- /dev/null
+++ b/testdata/it.text
@@ -0,0 +1,3 @@
+now is the time for _all good_ men to come to
+
+
diff --git a/testdata/itbold1.text b/testdata/itbold1.text
new file mode 100644
index 0000000..ef1dee6
--- /dev/null
+++ b/testdata/itbold1.text
@@ -0,0 +1,3 @@
+_a b C d_
+
+
diff --git a/testdata/itbold2.text b/testdata/itbold2.text
new file mode 100644
index 0000000..f4ec078
--- /dev/null
+++ b/testdata/itbold2.text
@@ -0,0 +1,3 @@
+_A B c d_
+
+
diff --git a/testdata/itbold3.text b/testdata/itbold3.text
new file mode 100644
index 0000000..619c8af
--- /dev/null
+++ b/testdata/itbold3.text
@@ -0,0 +1,3 @@
+_a b C D_
+
+
diff --git a/testdata/nowiki-ind.text b/testdata/nowiki-ind.text
new file mode 100644
index 0000000..9d7c468
--- /dev/null
+++ b/testdata/nowiki-ind.text
@@ -0,0 +1,7 @@
+Para
+
+a b c
+
+para
+
+
diff --git a/testdata/nowiki-tag.html b/testdata/nowiki-tag.html
index 58b96fc..47edb4a 100644
--- a/testdata/nowiki-tag.html
+++ b/testdata/nowiki-tag.html
@@ -1 +1,2 @@
-<p>A <tag></p>
\ No newline at end of file
+<p>A &lt;tag&gt;</p>
+
diff --git a/testdata/nowiki-tag.text b/testdata/nowiki-tag.text
new file mode 100644
index 0000000..1a9bf08
--- /dev/null
+++ b/testdata/nowiki-tag.text
@@ -0,0 +1,3 @@
+A <tag>
+
+
diff --git a/testdata/nowiki.text b/testdata/nowiki.text
new file mode 100644
index 0000000..7a6814f
--- /dev/null
+++ b/testdata/nowiki.text
@@ -0,0 +1,3 @@
+#:version=1.0_rest_ of line
+
+
diff --git a/testdata/numlist.text b/testdata/numlist.text
new file mode 100644
index 0000000..869fd9b
--- /dev/null
+++ b/testdata/numlist.text
@@ -0,0 +1,9 @@
+
+ 1. _Numbered lists_ are:
+ 1. Very organized
+ 2. Easy to follow
+A newline marks the end of the list.
+
+
+ 1. New numbering starts with 1.
+
diff --git a/testdata/para.text b/testdata/para.text
new file mode 100644
index 0000000..65ec74a
--- /dev/null
+++ b/testdata/para.text
@@ -0,0 +1,6 @@
+First paragraph consists of two sentences. Each sentence occupies a line.
+
+Second paragraph consists of two sentences as well. Each of them, again,
+occupies its own line.
+
+
diff --git a/testdata/unlist.text b/testdata/unlist.text
new file mode 100644
index 0000000..5745cb0
--- /dev/null
+++ b/testdata/unlist.text
@@ -0,0 +1,12 @@
+
+ - _Unordered lists_ are easy to do:
+ - Start every line with a star.
+ - More stars indicate a deeper level.
+ Previous item continues.
+ - A newline
+ - in a list
+marks the end of the list.
+
+
+ - Of course you can start again.
+
diff --git a/tests/test.py b/tests/test-html.py
index c54a717..c54a717 100644
--- a/tests/test.py
+++ b/tests/test-html.py
diff --git a/tests/test-text.py b/tests/test-text.py
new file mode 100644
index 0000000..09299e9
--- /dev/null
+++ b/tests/test-text.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2008-2018 Sergey Poznyakoff
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import print_function
+import unittest
+from WikiTrans.wiki2text import TextWikiMarkup, TextWiktionaryMarkup
+
+class TestMarkupParserBasic (unittest.TestCase):
+
+ def test_colon(self):
+ self.assertTrue(self.__test('colon'))
+ pass
+
+ def test_headings(self):
+ self.assertTrue(self.__test('headings'))
+ pass
+
+ def test_hz(self):
+ self.assertTrue(self.__test('hz'))
+ pass
+
+ def test_numlist(self):
+ self.assertTrue(self.__test('numlist'))
+ pass
+
+ def test_unlist(self):
+ self.assertTrue(self.__test('unlist'))
+ pass
+
+ def test_deflist(self):
+ self.assertTrue(self.__test('deflist'))
+ pass
+
+ def test_para(self):
+ self.assertTrue(self.__test('para'))
+ pass
+
+ def test_it(self):
+ self.assertTrue(self.__test('it'))
+ pass
+
+ def test_bold(self):
+ self.assertTrue(self.__test('bold'))
+ pass
+
+ def test_boldit1(self):
+ self.assertTrue(self.__test('boldit1'))
+ pass
+
+ def test_itbold1(self):
+ self.assertTrue(self.__test('itbold1'))
+ pass
+
+ def test_boldit2(self):
+ self.assertTrue(self.__test('boldit2'))
+ pass
+
+ def test_itbold2(self):
+ self.assertTrue(self.__test('itbold2'))
+ pass
+
+ def test_boldit3(self):
+ self.assertTrue(self.__test('boldit3'))
+ pass
+
+ def test_itbold3(self):
+ self.assertTrue(self.__test('itbold3'))
+ pass
+
+ def test_nowiki(self):
+ self.assertTrue(self.__test('nowiki'))
+ pass
+
+ def test_nowiki_tag(self):
+ self.assertTrue(self.__test('nowiki-tag'))
+ pass
+
+ def test_nowiki_ind(self):
+ self.assertTrue(self.__test('nowiki-ind'))
+ pass
+
+ # def test_door(self):
+ # self.assertTrue(self.__test('door'))
+ # pass
+
+ # def test_drzwi(self):
+ # self.assertTrue(self.__test('drzwi'))
+ # pass
+
+ def __test(self, filename):
+ name_in = 'testdata/' + filename + '.wiki'
+ name_out = 'testdata/' + filename + '.text'
+ fh = open(name_out)
+ buf = ''.join(fh.readlines()).strip()
+ fh.close()
+ hwm = TextWiktionaryMarkup(filename=name_in, lang="pl")
+ hwm.parse()
+
+ if str(hwm).strip() == buf:
+ return True
+
+ # fail
+ print("\n>>>%s<<<" % buf)
+ print(">>>%s<<<" % str(hwm).strip())
+ return False
+
+if __name__ == '__main__':
+ unittest.main()
+

Return to:

Send suggestions and report system problems to the System administrator.