summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-07 11:12:11 +0300
committer Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-07 11:18:28 +0300
commit 7d27e94627b40db4eb47b3d043c0865cfd7cb6d4 (patch)
tree 89dd6ba44a2e2fc77eb89a63bcbf5cda39954086
parent b74b1d5fe2326f56a2e37f57c38b929307c71282 (diff)
download wikitrans-7d27e94627b40db4eb47b3d043c0865cfd7cb6d4.tar.gz
wikitrans-7d27e94627b40db4eb47b3d043c0865cfd7cb6d4.tar.bz2
Handle ambiguities between italic and bold markers, add more tests.
* wikimarkup.py (tokenize): Handle the three possible ambiguities in placing italic and bold markers. Redefine unresolved markers as text.
* test.py: Add new tests.
* testdata/boldit1.wiki: Rewrite.
* testdata/boldit2.wiki: Rewrite.
* testdata/boldit3.wiki: Rewrite.
* testdata/boldit4.wiki: Remove.
* testdata/boldit5.wiki: Remove.
* testdata/bold.html: New file.
* testdata/boldit0.html: New file.
* testdata/boldit1.html: New file.
* testdata/boldit3.html: New file.
* testdata/it.html: New file.
* testdata/itbold1.html: New file.
* testdata/itbold1.wiki: New file.
* testdata/itbold2.html: New file.
* testdata/itbold2.wiki: New file.
* testdata/itbold3.html: New file.
* testdata/itbold3.wiki: New file.
* testdata/para.html: New file.
-rw-r--r-- test.py 46
-rw-r--r-- testdata/bold.html 1
-rw-r--r-- testdata/boldit0.html 1
-rw-r--r-- testdata/boldit1.html 1
-rw-r--r-- testdata/boldit1.wiki 3
-rw-r--r-- testdata/boldit2.wiki 2
-rw-r--r-- testdata/boldit3.html 1
-rw-r--r-- testdata/boldit3.wiki 2
-rw-r--r-- testdata/boldit4.wiki 1
-rw-r--r-- testdata/boldit5.wiki 1
-rw-r--r-- testdata/it.html 1
-rw-r--r-- testdata/itbold1.html 1
-rw-r--r-- testdata/itbold1.wiki 1
-rw-r--r-- testdata/itbold2.html 1
-rw-r--r-- testdata/itbold2.wiki 1
-rw-r--r-- testdata/itbold3.html 1
-rw-r--r-- testdata/itbold3.wiki 1
-rw-r--r-- testdata/para.html 3
-rw-r--r-- wikimarkup.py 49
19 files changed, 98 insertions, 20 deletions
diff --git a/test.py b/test.py
index 65787e0..cf63ed7 100644
--- a/test.py
+++ b/test.py
@@ -38,23 +38,59 @@ class TestMarkupParserBasic (unittest.TestCase):
def test_unlist(self):
self.assert_(self.__test('unlist'))
pass
def test_deflist(self):
- self.assert_(self.__test('unlist'))
+ self.assert_(self.__test('deflist'))
+ pass
+
+ def test_para(self):
+ self.assert_(self.__test('para'))
+ pass
+
+ def test_it(self):
+ self.assert_(self.__test('it'))
pass
- def test_door(self):
- self.assert_(self.__test('door'))
+ def test_bold(self):
+ self.assert_(self.__test('bold'))
+ pass
+
+ def test_boldit1(self):
+ self.assert_(self.__test('boldit1'))
+ pass
+
+ def test_itbold1(self):
+ self.assert_(self.__test('itbold1'))
pass
- def test_drzwi(self):
- self.assert_(self.__test('drzwi'))
+ def test_boldit2(self):
+ self.assert_(self.__test('boldit2'))
pass
+ def test_itbold2(self):
+ self.assert_(self.__test('itbold2'))
+ pass
+
+ def test_boldit3(self):
+ self.assert_(self.__test('boldit3'))
+ pass
+
+ def test_itbold3(self):
+ self.assert_(self.__test('itbold3'))
+ pass
+
+ # def test_door(self):
+ # self.assert_(self.__test('door'))
+ # pass
+
+ # def test_drzwi(self):
+ # self.assert_(self.__test('drzwi'))
+ # pass
+
def __test(self, filename):
name_in = 'testdata/' + filename + '.wiki'
name_out = 'testdata/' + filename + '.html'
fh = open(name_out)
buf = ''.join(fh.readlines()).strip()
hwm = wiki2html.HtmlWiktionaryMarkup(filename=name_in, lang="pl")
diff --git a/testdata/bold.html b/testdata/bold.html
new file mode 100644
index 0000000..8fac382
--- /dev/null
+++ b/testdata/bold.html
@@ -0,0 +1 @@
+<p>now is the time for <b>all good</b> men to come to</p>
diff --git a/testdata/boldit0.html b/testdata/boldit0.html
new file mode 100644
index 0000000..cc19149
--- /dev/null
+++ b/testdata/boldit0.html
@@ -0,0 +1 @@
+<p>now is the time for <i><b>all good</b></i> men to come to</p>
diff --git a/testdata/boldit1.html b/testdata/boldit1.html
new file mode 100644
index 0000000..03b76ed
--- /dev/null
+++ b/testdata/boldit1.html
@@ -0,0 +1 @@
+<p><b>a b <i>c</i> d</b></p>
diff --git a/testdata/boldit1.wiki b/testdata/boldit1.wiki
index 6ac9262..1a36e08 100644
--- a/testdata/boldit1.wiki
+++ b/testdata/boldit1.wiki
@@ -1 +1,2 @@
-now is the time for ''all '''good''''' men to come to
+'''a b ''c'' d'''
+
diff --git a/testdata/boldit2.wiki b/testdata/boldit2.wiki
index 0cca5c3..df9e3b0 100644
--- a/testdata/boldit2.wiki
+++ b/testdata/boldit2.wiki
@@ -1 +1 @@
-now is the time for '''all ''good''''' men to come to
+'''''a b'' c d'''
diff --git a/testdata/boldit3.html b/testdata/boldit3.html
new file mode 100644
index 0000000..23534e8
--- /dev/null
+++ b/testdata/boldit3.html
@@ -0,0 +1 @@
+<p><b>a b <i>c d</i></b></p>
diff --git a/testdata/boldit3.wiki b/testdata/boldit3.wiki
index 49d8a7e..a3c60a7 100644
--- a/testdata/boldit3.wiki
+++ b/testdata/boldit3.wiki
@@ -1 +1 @@
-now is the time for ''all '''good''' men'' to come to
+'''a b ''c d'''''
diff --git a/testdata/boldit4.wiki b/testdata/boldit4.wiki
deleted file mode 100644
index d3725d0..0000000
--- a/testdata/boldit4.wiki
+++ /dev/null
@@ -1 +0,0 @@
-'''''Door''' files kan ik niet op tijd komen.''
diff --git a/testdata/boldit5.wiki b/testdata/boldit5.wiki
deleted file mode 100644
index d0d2c1c..0000000
--- a/testdata/boldit5.wiki
+++ /dev/null
@@ -1 +0,0 @@
-'''''Door'' files kan ik niet op tijd komen.'''
diff --git a/testdata/it.html b/testdata/it.html
new file mode 100644
index 0000000..c810e36
--- /dev/null
+++ b/testdata/it.html
@@ -0,0 +1 @@
+<p>now is the time for <i>all good</i> men to come to</p>
diff --git a/testdata/itbold1.html b/testdata/itbold1.html
new file mode 100644
index 0000000..03e5e68
--- /dev/null
+++ b/testdata/itbold1.html
@@ -0,0 +1 @@
+<p><i>a b <b>c</b> d</i></p>
diff --git a/testdata/itbold1.wiki b/testdata/itbold1.wiki
new file mode 100644
index 0000000..c1fb3da
--- /dev/null
+++ b/testdata/itbold1.wiki
@@ -0,0 +1 @@
+''a b '''c''' d''
\ No newline at end of file
diff --git a/testdata/itbold2.html b/testdata/itbold2.html
new file mode 100644
index 0000000..a755b30
--- /dev/null
+++ b/testdata/itbold2.html
@@ -0,0 +1 @@
+<p><i><b>a b</b> c d</i></p>
diff --git a/testdata/itbold2.wiki b/testdata/itbold2.wiki
new file mode 100644
index 0000000..c43dbb3
--- /dev/null
+++ b/testdata/itbold2.wiki
@@ -0,0 +1 @@
+'''''a b''' c d''
\ No newline at end of file
diff --git a/testdata/itbold3.html b/testdata/itbold3.html
new file mode 100644
index 0000000..61f3a66
--- /dev/null
+++ b/testdata/itbold3.html
@@ -0,0 +1 @@
+<p><i>a b <b>c d</b></i></p>
diff --git a/testdata/itbold3.wiki b/testdata/itbold3.wiki
new file mode 100644
index 0000000..8888dc0
--- /dev/null
+++ b/testdata/itbold3.wiki
@@ -0,0 +1 @@
+''a b '''c d'''''
\ No newline at end of file
diff --git a/testdata/para.html b/testdata/para.html
new file mode 100644
index 0000000..cd3f732
--- /dev/null
+++ b/testdata/para.html
@@ -0,0 +1,3 @@
+<p>First paragraph consists of two sentences.
+Each sentence occupies a line.</p><p>Second paragraph consists of two sentences as well.
+Each of them, again, occupies its own line.</p>
diff --git a/wikimarkup.py b/wikimarkup.py
index 636012e..ba49eb5 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -131,35 +131,64 @@ class BaseWikiMarkup:
'content': line[pos:] })
line = None
def input(self):
return None
+ def swaptkn(self, i, j):
+ self.dprint(80, "SWAPPING %s <-> %s", i, j)
+ x = self.toklist[i]
+ self.toklist[i] = self.toklist[j]
+ self.toklist[j] = x
+
def tokenize(self):
self.toklist = []
for tok in self.tokread():
self.dprint(100, "TOK: %s", tok)
self.toklist.append(tok)
# Determine and fix up the ordering of bold and italic markers
- # This helps correctly parse inputs like:
- # '''''Door''' files kan ik niet op tijd komen.''
+ # There are three possible cases:
+ #
+ # 1a. '''a b ''c'' d'''
+ # 1b. ''a b '''c''' d''
+ #
+ # 2a. '''''a b'' c d'''
+ # 2b. '''''a b''' c d''
+ #
+ # 3a. '''a b ''c d'''''
+ # 3b. ''a b '''c d'''''
stack = []
for i in range(0,len(self.toklist)):
if self.toklist[i]['type'] == 'DELIM' \
and (self.toklist[i]['content'] == "''" \
or self.toklist[i]['content'] == "'''"):
- if len(stack) > 0 \
- and self.toklist[stack[-1]]['content'] == self.toklist[i]['content']:
- stack.pop()
- elif len(stack) > 1:
- x = self.toklist[stack[-2]]
- self.toklist[stack[-2]] = self.toklist[stack[-1]]
- self.toklist[stack[-1]] = x
- stack.pop()
+ if len(stack) > 0:
+ if self.toklist[stack[-1]]['content'] == self.toklist[i]['content']:
+ # Case 1: just pop the matching delimiter off the stack
+ stack.pop()
+ elif len(stack) == 2 and stack[-2] + 1 == stack[-1]:
+ # Case 2: swap delimiters saved on stack ...
+ self.swaptkn(stack[-2], stack[-1])
+ # and pop off the matching one
+ stack.pop()
+ elif i < len(self.toklist) \
+ and self.toklist[i+1]['type'] == 'DELIM' \
+ and self.toklist[stack[-1]]['content'] == self.toklist[i+1]['content']:
+ # Case 3: swap current and next tokens
+ self.swaptkn(i, i+1)
+ # and pop off the matching one
+ stack.pop()
+ else:
+ # Push the token on stack
+ stack.append(i)
else:
+ # Push the token on stack
stack.append(i)
+ # Redefine all non-matched tokens as TEXT
+ for i in stack:
+ self.toklist[i]['type'] = 'TEXT'
def peektkn(self):
return self.toklist[self.tokind]
def setkn(self,val):
self.toklist[self.tokind] = val

Return to:

Send suggestions and report system problems to the System administrator.