diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-07 11:12:11 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2015-07-07 11:18:28 +0300 |
commit | 7d27e94627b40db4eb47b3d043c0865cfd7cb6d4 (patch) | |
tree | 89dd6ba44a2e2fc77eb89a63bcbf5cda39954086 | |
parent | b74b1d5fe2326f56a2e37f57c38b929307c71282 (diff) | |
download | wikitrans-7d27e94627b40db4eb47b3d043c0865cfd7cb6d4.tar.gz wikitrans-7d27e94627b40db4eb47b3d043c0865cfd7cb6d4.tar.bz2 |
Handle ambiguities between italic and bold markers, add more tests.
* wikimarkup.py (tokenize): Handle the three possible
ambiguities in placing italic and bold markers.
Redefine unresolved markers as text.
* test.py: Add new tests.
* testdata/boldit1.wiki: Rewrite.
* testdata/boldit2.wiki: Rewrite.
* testdata/boldit3.wiki: Rewrite.
* testdata/boldit4.wiki: Remove.
* testdata/boldit5.wiki: Remove.
* testdata/bold.html: New file.
* testdata/boldit0.html: New file.
* testdata/boldit1.html: New file.
* testdata/boldit3.html: New file.
* testdata/it.html: New file.
* testdata/itbold1.html: New file.
* testdata/itbold1.wiki: New file.
* testdata/itbold2.html: New file.
* testdata/itbold2.wiki: New file.
* testdata/itbold3.html: New file.
* testdata/itbold3.wiki: New file.
* testdata/para.html: New file.
-rw-r--r-- | test.py | 46 | ||||
-rw-r--r-- | testdata/bold.html | 1 | ||||
-rw-r--r-- | testdata/boldit0.html | 1 | ||||
-rw-r--r-- | testdata/boldit1.html | 1 | ||||
-rw-r--r-- | testdata/boldit1.wiki | 3 | ||||
-rw-r--r-- | testdata/boldit2.wiki | 2 | ||||
-rw-r--r-- | testdata/boldit3.html | 1 | ||||
-rw-r--r-- | testdata/boldit3.wiki | 2 | ||||
-rw-r--r-- | testdata/boldit4.wiki | 1 | ||||
-rw-r--r-- | testdata/boldit5.wiki | 1 | ||||
-rw-r--r-- | testdata/it.html | 1 | ||||
-rw-r--r-- | testdata/itbold1.html | 1 | ||||
-rw-r--r-- | testdata/itbold1.wiki | 1 | ||||
-rw-r--r-- | testdata/itbold2.html | 1 | ||||
-rw-r--r-- | testdata/itbold2.wiki | 1 | ||||
-rw-r--r-- | testdata/itbold3.html | 1 | ||||
-rw-r--r-- | testdata/itbold3.wiki | 1 | ||||
-rw-r--r-- | testdata/para.html | 3 | ||||
-rw-r--r-- | wikimarkup.py | 45 |
19 files changed, 96 insertions, 18 deletions
@@ -41,17 +41,53 @@ class TestMarkupParserBasic (unittest.TestCase): pass def test_deflist(self): - self.assert_(self.__test('unlist')) + self.assert_(self.__test('deflist')) + pass + + def test_para(self): + self.assert_(self.__test('para')) + pass + + def test_it(self): + self.assert_(self.__test('it')) + pass + + def test_bold(self): + self.assert_(self.__test('bold')) + pass + + def test_boldit1(self): + self.assert_(self.__test('boldit1')) pass - def test_door(self): - self.assert_(self.__test('door')) + def test_itbold1(self): + self.assert_(self.__test('itbold1')) pass - def test_drzwi(self): - self.assert_(self.__test('drzwi')) + def test_boldit2(self): + self.assert_(self.__test('boldit2')) pass + def test_itbold2(self): + self.assert_(self.__test('itbold2')) + pass + + def test_boldit3(self): + self.assert_(self.__test('boldit3')) + pass + + def test_itbold3(self): + self.assert_(self.__test('itbold3')) + pass + + # def test_door(self): + # self.assert_(self.__test('door')) + # pass + + # def test_drzwi(self): + # self.assert_(self.__test('drzwi')) + # pass + def __test(self, filename): name_in = 'testdata/' + filename + '.wiki' name_out = 'testdata/' + filename + '.html' diff --git a/testdata/bold.html b/testdata/bold.html new file mode 100644 index 0000000..8fac382 --- /dev/null +++ b/testdata/bold.html @@ -0,0 +1 @@ +<p>now is the time for <b>all good</b> men to come to</p> diff --git a/testdata/boldit0.html b/testdata/boldit0.html new file mode 100644 index 0000000..cc19149 --- /dev/null +++ b/testdata/boldit0.html @@ -0,0 +1 @@ +<p>now is the time for <i><b>all good</b></i> men to come to</p> diff --git a/testdata/boldit1.html b/testdata/boldit1.html new file mode 100644 index 0000000..03b76ed --- /dev/null +++ b/testdata/boldit1.html @@ -0,0 +1 @@ +<p><b>a b <i>c</i> d</b></p> diff --git a/testdata/boldit1.wiki b/testdata/boldit1.wiki index 6ac9262..1a36e08 100644 --- a/testdata/boldit1.wiki +++ b/testdata/boldit1.wiki @@ -1 +1,2 @@ -now is the time 
for ''all '''good''''' men to come to +'''a b ''c'' d''' + diff --git a/testdata/boldit2.wiki b/testdata/boldit2.wiki index 0cca5c3..df9e3b0 100644 --- a/testdata/boldit2.wiki +++ b/testdata/boldit2.wiki @@ -1 +1 @@ -now is the time for '''all ''good''''' men to come to +'''''a b'' c d''' diff --git a/testdata/boldit3.html b/testdata/boldit3.html new file mode 100644 index 0000000..23534e8 --- /dev/null +++ b/testdata/boldit3.html @@ -0,0 +1 @@ +<p><b>a b <i>c d</i></b></p> diff --git a/testdata/boldit3.wiki b/testdata/boldit3.wiki index 49d8a7e..a3c60a7 100644 --- a/testdata/boldit3.wiki +++ b/testdata/boldit3.wiki @@ -1 +1 @@ -now is the time for ''all '''good''' men'' to come to +'''a b ''c d''''' diff --git a/testdata/boldit4.wiki b/testdata/boldit4.wiki deleted file mode 100644 index d3725d0..0000000 --- a/testdata/boldit4.wiki +++ /dev/null @@ -1 +0,0 @@ -'''''Door''' files kan ik niet op tijd komen.'' diff --git a/testdata/boldit5.wiki b/testdata/boldit5.wiki deleted file mode 100644 index d0d2c1c..0000000 --- a/testdata/boldit5.wiki +++ /dev/null @@ -1 +0,0 @@ -'''''Door'' files kan ik niet op tijd komen.''' diff --git a/testdata/it.html b/testdata/it.html new file mode 100644 index 0000000..c810e36 --- /dev/null +++ b/testdata/it.html @@ -0,0 +1 @@ +<p>now is the time for <i>all good</i> men to come to</p> diff --git a/testdata/itbold1.html b/testdata/itbold1.html new file mode 100644 index 0000000..03e5e68 --- /dev/null +++ b/testdata/itbold1.html @@ -0,0 +1 @@ +<p><i>a b <b>c</b> d</i></p> diff --git a/testdata/itbold1.wiki b/testdata/itbold1.wiki new file mode 100644 index 0000000..c1fb3da --- /dev/null +++ b/testdata/itbold1.wiki @@ -0,0 +1 @@ +''a b '''c''' d''
\ No newline at end of file diff --git a/testdata/itbold2.html b/testdata/itbold2.html new file mode 100644 index 0000000..a755b30 --- /dev/null +++ b/testdata/itbold2.html @@ -0,0 +1 @@ +<p><i><b>a b</b> c d</i></p> diff --git a/testdata/itbold2.wiki b/testdata/itbold2.wiki new file mode 100644 index 0000000..c43dbb3 --- /dev/null +++ b/testdata/itbold2.wiki @@ -0,0 +1 @@ +'''''a b''' c d''
\ No newline at end of file diff --git a/testdata/itbold3.html b/testdata/itbold3.html new file mode 100644 index 0000000..61f3a66 --- /dev/null +++ b/testdata/itbold3.html @@ -0,0 +1 @@ +<p><i>a b <b>c d</b></i></p> diff --git a/testdata/itbold3.wiki b/testdata/itbold3.wiki new file mode 100644 index 0000000..8888dc0 --- /dev/null +++ b/testdata/itbold3.wiki @@ -0,0 +1 @@ +''a b '''c d'''''
\ No newline at end of file diff --git a/testdata/para.html b/testdata/para.html new file mode 100644 index 0000000..cd3f732 --- /dev/null +++ b/testdata/para.html @@ -0,0 +1,3 @@ +<p>First paragraph consists of two sentences. +Each sentence occupies a line.</p><p>Second paragraph consists of two sentences as well. +Each of them, again, occupies its own line.</p> diff --git a/wikimarkup.py b/wikimarkup.py index 636012e..ba49eb5 100644 --- a/wikimarkup.py +++ b/wikimarkup.py @@ -134,29 +134,58 @@ class BaseWikiMarkup: def input(self): return None + def swaptkn(self, i, j): + self.dprint(80, "SWAPPING %s <-> %s", i, j) + x = self.toklist[i] + self.toklist[i] = self.toklist[j] + self.toklist[j] = x + def tokenize(self): self.toklist = [] for tok in self.tokread(): self.dprint(100, "TOK: %s", tok) self.toklist.append(tok) # Determine and fix up the ordering of bold and italic markers - # This helps correctly parse inputs like: - # '''''Door''' files kan ik niet op tijd komen.'' + # There are three possible cases: + # + # 1a. '''a b ''c'' d''' + # 1b. ''a b '''c''' d'' + # + # 2a. '''''a b'' c d''' + # 2b. '''''a b''' c d'' + # + # 3a. '''a b ''c d''''' + # 3b. ''a b '''c d''''' stack = [] for i in range(0,len(self.toklist)): if self.toklist[i]['type'] == 'DELIM' \ and (self.toklist[i]['content'] == "''" \ or self.toklist[i]['content'] == "'''"): - if len(stack) > 0 \ - and self.toklist[stack[-1]]['content'] == self.toklist[i]['content']: + if len(stack) > 0: + if self.toklist[stack[-1]]['content'] == self.toklist[i]['content']: + # Case 1: just pop the matching delimiter off the stack stack.pop() - elif len(stack) > 1: - x = self.toklist[stack[-2]] - self.toklist[stack[-2]] = self.toklist[stack[-1]] - self.toklist[stack[-1]] = x + elif len(stack) == 2 and stack[-2] + 1 == stack[-1]: + # Case 2: swap delimiters saved on stack ... 
+ self.swaptkn(stack[-2], stack[-1]) + # and pop off the matching one stack.pop() + elif i < len(self.toklist) \ + and self.toklist[i+1]['type'] == 'DELIM' \ + and self.toklist[stack[-1]]['content'] == self.toklist[i+1]['content']: + # Case 3: swap current and next tokens + self.swaptkn(i, i+1) + # and pop off the matching one + stack.pop() + else: + # Push the token on stack + stack.append(i) else: + # Push the token on stack stack.append(i) + # Redefine all non-matched tokens as TEXT + for i in stack: + self.toklist[i]['type'] = 'TEXT' def peektkn(self): return self.toklist[self.tokind] |