diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2018-08-01 17:18:07 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2018-08-01 17:22:24 +0300 |
commit | d8a4e3719e300869759393e9df7dd473f15be781 (patch) | |
tree | 7781fd56761a5c33befb34ca5eaf16ad9e9970ae | |
parent | a85089f4495839590c791c02c833cd0f9b49733e (diff) | |
download | wit-d8a4e3719e300869759393e9df7dd473f15be781.tar.gz wit-d8a4e3719e300869759393e9df7dd473f15be781.tar.bz2 |
Fix wiki2text
* WikiTrans/wiki2html.py (HtmlTextNode): Escape html entities
* WikiTrans/wiki2text.py: Remove the format method.
* testdata/headings.html: Fix.
* testdata/nowiki-tag.html: Fix.
* tests/test.py: Rename to tests/test-html.py
* tests/test-text.py: New file.
* testdata/bold.text: New file.
* testdata/boldit1.text: New file.
* testdata/boldit2.text: New file.
* testdata/boldit3.text: New file.
* testdata/colon.text: New file.
* testdata/deflist.text: New file.
* testdata/headings.text: New file.
* testdata/hz.text: New file.
* testdata/it.text: New file.
* testdata/itbold1.text: New file.
* testdata/itbold2.text: New file.
* testdata/itbold3.text: New file.
* testdata/nowiki-ind.text: New file.
* testdata/nowiki-tag.text: New file.
* testdata/nowiki.text: New file.
* testdata/numlist.text: New file.
* testdata/para.text: New file.
* testdata/unlist.text: New file.
-rw-r--r-- | WikiTrans/wiki2html.py | 9 | ||||
-rw-r--r-- | WikiTrans/wiki2text.py | 144 | ||||
-rw-r--r-- | testdata/bold.text | 3 | ||||
-rw-r--r-- | testdata/boldit1.text | 3 | ||||
-rw-r--r-- | testdata/boldit2.text | 3 | ||||
-rw-r--r-- | testdata/boldit3.text | 3 | ||||
-rw-r--r-- | testdata/colon.text | 8 | ||||
-rw-r--r-- | testdata/deflist.text | 7 | ||||
-rw-r--r-- | testdata/headings.html | 2 | ||||
-rw-r--r-- | testdata/headings.text | 20 | ||||
-rw-r--r-- | testdata/hz.text | 8 | ||||
-rw-r--r-- | testdata/it.text | 3 | ||||
-rw-r--r-- | testdata/itbold1.text | 3 | ||||
-rw-r--r-- | testdata/itbold2.text | 3 | ||||
-rw-r--r-- | testdata/itbold3.text | 3 | ||||
-rw-r--r-- | testdata/nowiki-ind.text | 7 | ||||
-rw-r--r-- | testdata/nowiki-tag.html | 3 | ||||
-rw-r--r-- | testdata/nowiki-tag.text | 3 | ||||
-rw-r--r-- | testdata/nowiki.text | 3 | ||||
-rw-r--r-- | testdata/numlist.text | 9 | ||||
-rw-r--r-- | testdata/para.text | 6 | ||||
-rw-r--r-- | testdata/unlist.text | 12 | ||||
-rw-r--r-- | tests/test-html.py (renamed from tests/test.py) | 0 | ||||
-rw-r--r-- | tests/test-text.py | 123 |
24 files changed, 266 insertions, 122 deletions
diff --git a/WikiTrans/wiki2html.py b/WikiTrans/wiki2html.py index 67cb9c3..d2af746 100644 --- a/WikiTrans/wiki2html.py +++ b/WikiTrans/wiki2html.py @@ -25,6 +25,11 @@ try: except ImportError: from urllib.parse import quote as url_quote +try: + from html import escape as html_escape +except ImportError: + from cgi import escape as html_escape + __all__ = [ "HtmlWikiMarkup", "HtmlWiktionaryMarkup" ] class HtmlSeqNode(WikiSeqNode): @@ -100,9 +105,9 @@ class HtmlTextNode(HtmlSeqNode): if isinstance(self.content,list): s = ''.join(self.content) else: - s = self.content + s = html_escape(self.content, quote=True) return s - + class HtmlHdrNode(WikiHdrNode): def format(self): level = self.level diff --git a/WikiTrans/wiki2text.py b/WikiTrans/wiki2text.py index cd4937e..004ae4a 100644 --- a/WikiTrans/wiki2text.py +++ b/WikiTrans/wiki2text.py @@ -19,7 +19,10 @@ from WikiTrans.wikitoken import * from WikiTrans.wikimarkup import * from WikiTrans.wikins import wiki_ns_re, wiki_ns import re -import urllib +try: + from urllib import quote as url_quote +except ImportError: + from urllib.parse import quote as url_quote class TextSeqNode(WikiSeqNode): def format(self): @@ -32,9 +35,9 @@ class TextSeqNode(WikiSeqNode): class TextTextNode(WikiTextNode): def format(self): - if isinstance(elt.content,list): + if isinstance(self.content,list): string = "" - for s in elt.content: + for s in self.content: if string: if string.endswith("."): string += " " @@ -42,13 +45,13 @@ class TextTextNode(WikiTextNode): string += " " string += s else: - string = elt.content + string = self.content return string class TextPreNode(WikiSeqNode): def format(self): string = "" - for x in elt.content: + for x in self.content: string += x.format() string += '\n' return string @@ -56,7 +59,7 @@ class TextPreNode(WikiSeqNode): class TextParaNode(WikiSeqNode): def format(self): string = "" - for x in elt.content: + for x in self.content: string += x.format() string = self.parser.fmtpara(string) + '\n\n' return 
string @@ -64,7 +67,7 @@ class TextParaNode(WikiSeqNode): class TextItNode(WikiSeqNode): def format(self): string = "" - for x in elt.content: + for x in self.content: s = x.format() if s: string += " " + s @@ -73,7 +76,7 @@ class TextItNode(WikiSeqNode): class TextBoldNode(WikiSeqNode): def format(self): string = "" - for x in elt.content: + for x in self.content: if string.endswith("."): string += " " else: @@ -105,8 +108,8 @@ class TextLinkNode(WikiSeqNode): return "" text = "[%s: %s]" % (qual, text if text else arg) tgt = self.image_base + '/' + \ - urllib.quote(tgt) + \ - '/250px-' + urllib.quote(tgt) + url_quote(tgt) + \ + '/250px-' + url_quote(tgt) elif ns == 'NS_MEDIA': text = "[%s]" % (qual) else: @@ -139,7 +142,7 @@ class TextBarNode(WikiNode): class TextHdrNode(WikiHdrNode): def format(self): return "\n" + ("*" * self.level) + " " + \ - elt.content.format().lstrip(" ") + "\n\n" + self.content.format().lstrip(" ") + "\n\n" class TextRefNode(WikiRefNode): def format(self): @@ -162,9 +165,9 @@ class TextEnvNode(WikiEnvNode): string += "\n" x = s.content.format() if type == "unnumbered": - string += self.parser.fmtpara(self.parser.indent(lev, "- " + x.lstrip(" "))) + string += self.parser.indent(lev, "- " + x.lstrip(" ")) elif type == "numbered": - string += self.parser.fmtpara(self.parser.indent(lev, "%d. %s" % (n, x))) + string += self.parser.indent(lev, "%d. 
%s" % (n, x)) n += 1 elif type == "defn": if s.subtype == 0: @@ -175,7 +178,7 @@ class TextEnvNode(WikiEnvNode): if not string.endswith("\n"): string += "\n" - return string + return string class TextIndNode(WikiIndNode): def format(self): @@ -185,13 +188,13 @@ class TextTagNode(WikiTagNode): def format(self): if self.tag == 'code': self.parser.nested += 1 - s = elt.content.format() + s = self.content.format() self.parser.nested -= 1 else: s = '<' + self.tag if self.args: s += ' ' + str(self.args) - s += '>' + elt.content.format() + '</' + self.tag + '>' + s += '>' + self.content.format() + '</' + self.tag + '>' return s @@ -246,7 +249,7 @@ class TextWikiMarkup (WikiMarkup): def mktgt(self, tgt, lang = None): if not lang: lang = self.lang - return self.html_base % { 'lang' : lang } + urllib.quote(tgt) + return self.html_base % { 'lang' : lang } + url_quote(tgt) def fmtlink(self, elt, istmpl): arg = self.format(elt.content[0]) @@ -271,8 +274,8 @@ class TextWikiMarkup (WikiMarkup): return "" text = "[%s: %s]" % (qual, text if text else arg) tgt = self.image_base + '/' + \ - urllib.quote(tgt) + \ - '/250px-' + urllib.quote(tgt) + url_quote(tgt) + \ + '/250px-' + url_quote(tgt) elif ns == 'NS_MEDIA': text = "[%s]" % (qual) else: @@ -311,7 +314,9 @@ class TextWikiMarkup (WikiMarkup): length = 0 for s in input.split(): wlen = len(s) - if linebuf.endswith("."): + if len(linebuf) == 0: + wsc = 0 + elif linebuf.endswith("."): wsc = 2 else: wsc = 1 @@ -338,107 +343,10 @@ class TextWikiMarkup (WikiMarkup): s += '>' + self.format(elt.content) + '</' + elt.tag + '>' return s - def format(self, elt): - if elt.type == 'TEXT': - if isinstance(elt.content,list): - string = "" - for s in elt.content: - if string: - if string.endswith("."): - string += " " - else: - string += " " - string += s - else: - string = elt.content - elif elt.type == 'PRE': - string = "" - for x in elt.content: - string += self.format(x) - string += '\n' - elif elt.type == 'PARA': - string = ""; - for x in 
elt.content: - string += self.format(x) - string = self.fmtpara(string) + '\n\n' - elif elt.type == 'TAG': - string = self.str_tag(elt) - elif elt.type == 'IT': - string = "" - for x in elt.content: - s = self.format(x) - if s: - string += " " + s - string = "_" + string.lstrip(" ") + "_" - elif elt.type == 'BOLD': - string = "" - for x in elt.content: - s = self.format(x) - if s: - if string.endswith("."): - string += " " - else: - string += " " - string += s - string = string.upper() - elif elt.type == 'LINK': - string = self.fmtlink(elt, False) - elif elt.type == 'TMPL': - s = self.fmtlink(elt, True) - if s: - string = '[' + s + ']' - else: - string = s - elif elt.type == 'BAR': - w = self.width - if w < 5: - w = 5 - string = "\n" + ("-" * (w - 5)).center(w - 1) + "\n" - elif elt.type == 'HDR': - string = "\n" + ("*" * elt.level) + " " + \ - self.format(elt.content).lstrip(" ") + "\n\n" - elif elt.type == 'REF': - string = self.xref(self.format(elt.content), elt.ref) - elif elt.type == 'ENV': - type = elt.envtype - lev = elt.level - if lev > self.width - 4: - lev = 1 - string = "" - n = 1 - for s in elt.content: - if not string.endswith("\n"): - string += "\n" - x = self.format(s.content) - if type == "unnumbered": - string += self.fmtpara(self.indent(lev, "- " + x.lstrip(" "))) - elif type == "numbered": - string += self.fmtpara(self.indent(lev, "%d. 
%s" % (n, x))) - n += 1 - elif type == "defn": - if s.subtype == 0: - string += self.indent(lev-1, x) - else: - string += self.indent(lev+3, x) - - if not string.endswith("\n"): - string += "\n" - elif elt.type == 'IND': - string = (" " * elt.level) + self.format(elt.content) + '\n' - elif elt.type == 'SEQ': - string = "" - for x in elt.content: - if len(string) > 1 and not string[-1].isspace(): - string += ' ' - string += self.format(x) - else: - string = str(elt) - return string - def __str__(self): str = "" for elt in self.tree: - str += self.format(elt) + str += elt.format() return str class TextWiktionaryMarkup (TextWikiMarkup): diff --git a/testdata/bold.text b/testdata/bold.text new file mode 100644 index 0000000..30042f6 --- /dev/null +++ b/testdata/bold.text @@ -0,0 +1,3 @@ +now is the time for ALL GOOD men to come to + + diff --git a/testdata/boldit1.text b/testdata/boldit1.text new file mode 100644 index 0000000..adad116 --- /dev/null +++ b/testdata/boldit1.text @@ -0,0 +1,3 @@ +A B _C_ D + + diff --git a/testdata/boldit2.text b/testdata/boldit2.text new file mode 100644 index 0000000..2ac53de --- /dev/null +++ b/testdata/boldit2.text @@ -0,0 +1,3 @@ +_A B_ C D + + diff --git a/testdata/boldit3.text b/testdata/boldit3.text new file mode 100644 index 0000000..27ebe3f --- /dev/null +++ b/testdata/boldit3.text @@ -0,0 +1,3 @@ +A B _C D_ + + diff --git a/testdata/colon.text b/testdata/colon.text new file mode 100644 index 0000000..b315cec --- /dev/null +++ b/testdata/colon.text @@ -0,0 +1,8 @@ + A colon (:) indents a line or paragraph. +A newline starts a new paragraph. Should only be used on talk pages. For +articles, you probably want the blockquote tag. + + We use 1 colon to indent once. + We use 2 colons to indent twice. + 3 colons to indent 3 times, and so on. 
+ diff --git a/testdata/deflist.text b/testdata/deflist.text new file mode 100644 index 0000000..b04fa95 --- /dev/null +++ b/testdata/deflist.text @@ -0,0 +1,7 @@ + +item 1 + definition 1 +item 2 + definition 2-1 + definition 2-2 + diff --git a/testdata/headings.html b/testdata/headings.html index 445c821..9a8eaf0 100644 --- a/testdata/headings.html +++ b/testdata/headings.html @@ -5,7 +5,7 @@ sections. The Wiki software can automatically generate a <a href="http://pl.wiktionary.org/wiki/table%20of%20contents">table of contents</a> from them.</p> <h3>Subsection</h3> -<p>Using more "equals" (=) signs creates a subsection.</p> +<p>Using more &quot;equals&quot; (=) signs creates a subsection.</p> <h4>A smaller subsection</h4> <p>Don't skip levels, diff --git a/testdata/headings.text b/testdata/headings.text new file mode 100644 index 0000000..e6c30ea --- /dev/null +++ b/testdata/headings.text @@ -0,0 +1,20 @@ + +** Section headings + +_Headings_ organize your writing into sections. The Wiki software can +automatically generate a table of contents from them. + + +*** Subsection + +Using more "equals" (=) signs creates a subsection. + + +**** A smaller subsection + +Don't skip levels, like from two to four equals signs. + +Start with 2 equals signs not 1 because 1 creates H1 tags which should be +reserved for page title. + + diff --git a/testdata/hz.text b/testdata/hz.text new file mode 100644 index 0000000..bdc1764 --- /dev/null +++ b/testdata/hz.text @@ -0,0 +1,8 @@ +You can make horizontal dividing lines (----) to separate text. + + + ------------------------------------------------------------------------- +But you should usually use sections instead, so that they go in the table of +contents. 
+ + diff --git a/testdata/it.text b/testdata/it.text new file mode 100644 index 0000000..3de3b6b --- /dev/null +++ b/testdata/it.text @@ -0,0 +1,3 @@ +now is the time for _all good_ men to come to + + diff --git a/testdata/itbold1.text b/testdata/itbold1.text new file mode 100644 index 0000000..ef1dee6 --- /dev/null +++ b/testdata/itbold1.text @@ -0,0 +1,3 @@ +_a b C d_ + + diff --git a/testdata/itbold2.text b/testdata/itbold2.text new file mode 100644 index 0000000..f4ec078 --- /dev/null +++ b/testdata/itbold2.text @@ -0,0 +1,3 @@ +_A B c d_ + + diff --git a/testdata/itbold3.text b/testdata/itbold3.text new file mode 100644 index 0000000..619c8af --- /dev/null +++ b/testdata/itbold3.text @@ -0,0 +1,3 @@ +_a b C D_ + + diff --git a/testdata/nowiki-ind.text b/testdata/nowiki-ind.text new file mode 100644 index 0000000..9d7c468 --- /dev/null +++ b/testdata/nowiki-ind.text @@ -0,0 +1,7 @@ +Para + +a b c + +para + + diff --git a/testdata/nowiki-tag.html b/testdata/nowiki-tag.html index 58b96fc..47edb4a 100644 --- a/testdata/nowiki-tag.html +++ b/testdata/nowiki-tag.html @@ -1 +1,2 @@ -<p>A <tag></p>
\ No newline at end of file +<p>A &lt;tag&gt;</p> + diff --git a/testdata/nowiki-tag.text b/testdata/nowiki-tag.text new file mode 100644 index 0000000..1a9bf08 --- /dev/null +++ b/testdata/nowiki-tag.text @@ -0,0 +1,3 @@ +A <tag> + + diff --git a/testdata/nowiki.text b/testdata/nowiki.text new file mode 100644 index 0000000..7a6814f --- /dev/null +++ b/testdata/nowiki.text @@ -0,0 +1,3 @@ +#:version=1.0_rest_ of line + + diff --git a/testdata/numlist.text b/testdata/numlist.text new file mode 100644 index 0000000..869fd9b --- /dev/null +++ b/testdata/numlist.text @@ -0,0 +1,9 @@ + + 1. _Numbered lists_ are: + 1. Very organized + 2. Easy to follow +A newline marks the end of the list. + + + 1. New numbering starts with 1. + diff --git a/testdata/para.text b/testdata/para.text new file mode 100644 index 0000000..65ec74a --- /dev/null +++ b/testdata/para.text @@ -0,0 +1,6 @@ +First paragraph consists of two sentences. Each sentence occupies a line. + +Second paragraph consists of two sentences as well. Each of them, again, +occupies its own line. + + diff --git a/testdata/unlist.text b/testdata/unlist.text new file mode 100644 index 0000000..5745cb0 --- /dev/null +++ b/testdata/unlist.text @@ -0,0 +1,12 @@ + + - _Unordered lists_ are easy to do: + - Start every line with a star. + - More stars indicate a deeper level. + Previous item continues. + - A newline + - in a list +marks the end of the list. + + + - Of course you can start again. 
+ diff --git a/tests/test.py b/tests/test-html.py index c54a717..c54a717 100644 --- a/tests/test.py +++ b/tests/test-html.py diff --git a/tests/test-text.py b/tests/test-text.py new file mode 100644 index 0000000..09299e9 --- /dev/null +++ b/tests/test-text.py @@ -0,0 +1,123 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (C) 2008-2018 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from __future__ import print_function +import unittest +from WikiTrans.wiki2text import TextWikiMarkup, TextWiktionaryMarkup + +class TestMarkupParserBasic (unittest.TestCase): + + def test_colon(self): + self.assertTrue(self.__test('colon')) + pass + + def test_headings(self): + self.assertTrue(self.__test('headings')) + pass + + def test_hz(self): + self.assertTrue(self.__test('hz')) + pass + + def test_numlist(self): + self.assertTrue(self.__test('numlist')) + pass + + def test_unlist(self): + self.assertTrue(self.__test('unlist')) + pass + + def test_deflist(self): + self.assertTrue(self.__test('deflist')) + pass + + def test_para(self): + self.assertTrue(self.__test('para')) + pass + + def test_it(self): + self.assertTrue(self.__test('it')) + pass + + def test_bold(self): + self.assertTrue(self.__test('bold')) + pass + + def test_boldit1(self): + self.assertTrue(self.__test('boldit1')) + pass + + def test_itbold1(self): + self.assertTrue(self.__test('itbold1')) + pass + + def 
test_boldit2(self): + self.assertTrue(self.__test('boldit2')) + pass + + def test_itbold2(self): + self.assertTrue(self.__test('itbold2')) + pass + + def test_boldit3(self): + self.assertTrue(self.__test('boldit3')) + pass + + def test_itbold3(self): + self.assertTrue(self.__test('itbold3')) + pass + + def test_nowiki(self): + self.assertTrue(self.__test('nowiki')) + pass + + def test_nowiki_tag(self): + self.assertTrue(self.__test('nowiki-tag')) + pass + + def test_nowiki_ind(self): + self.assertTrue(self.__test('nowiki-ind')) + pass + + # def test_door(self): + # self.assertTrue(self.__test('door')) + # pass + + # def test_drzwi(self): + # self.assertTrue(self.__test('drzwi')) + # pass + + def __test(self, filename): + name_in = 'testdata/' + filename + '.wiki' + name_out = 'testdata/' + filename + '.text' + fh = open(name_out) + buf = ''.join(fh.readlines()).strip() + fh.close() + hwm = TextWiktionaryMarkup(filename=name_in, lang="pl") + hwm.parse() + + if str(hwm).strip() == buf: + return True + + # fail + print("\n>>>%s<<<" % buf) + print(">>>%s<<<" % str(hwm).strip()) + return False + +if __name__ == '__main__': + unittest.main() + |