summary | refs | log | tree | commit | diff | about
author Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-12 20:11:40 (GMT)
committer Sergey Poznyakoff <gray@gnu.org.ua> 2015-07-12 20:11:40 (GMT)
commit 28072898f1bd9a925d73ac187d560198d6345524 (patch) (unidiff)
tree a46d781fb85d9dda61fc8f68e0ba6ec43d60ce55
parent 75672b57a2d63f01d00795fe8d661d1efe7b6e8d (diff)
download wit-28072898f1bd9a925d73ac187d560198d6345524.tar.gz
download wit-28072898f1bd9a925d73ac187d560198d6345524.tar.bz2
Improve tag handling and debugging
* wikimarkup.py: Rewrite tag recognition. Implement dump method.
* wikicvt.py: New options -D (--dump), and -t dump
* wiki2html.py (input_tag): Remove method
(str_tag): Change handling of tags
* wiki2texi.py: Likewise.
* wiki2text.py: Likewise.
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r-- wiki2html.py 28
-rw-r--r-- wiki2texi.py 37
-rw-r--r-- wiki2text.py 27
-rwxr-xr-x wikicvt.py 26
-rw-r--r-- wikimarkup.py 317
5 files changed, 309 insertions, 126 deletions
diff --git a/wiki2html.py b/wiki2html.py
index 441bc76..66939c4 100644
--- a/wiki2html.py
+++ b/wiki2html.py
@@ -169,31 +169,27 @@ class HtmlWikiMarkup (WikiMarkup):
169 self.envt[type]["elt"][n]) 169 self.envt[type]["elt"][n])
170 return "<%s>%s</%s>" % (self.envt[type]["hdr"], 170 return "<%s>%s</%s>" % (self.envt[type]["hdr"],
171 string, 171 string,
172 self.envt[type]["hdr"]) 172 self.envt[type]["hdr"])
173 return string 173 return string
174 174
175 supported_tags = [ 'nowiki', 'code' ]
176 def input_tag(self, tag):
177 return tag['tag'] in self.supported_tags
178
179 def str_tag(self, elt): 175 def str_tag(self, elt):
180 if elt['tag'] == 'nowiki': 176 if elt['tag'] == 'nowiki':
181 return '<pre>' + elt['content'] + '</pre>' 177 return '<pre>' + self.format(elt['content']) + '</pre>'
182 elif elt['tag'] == 'code': 178 elif elt['tag'] == 'code':
183 kwdict = { 179 self.nested += 1
184 'nested': self.nested + 1, 180 s = self.format(elt['content'])
185 'lang': self.lang, 181 self.nested -= 1
186 'text': elt['content'], 182 return '<pre><code>' + s + '</code></pre>' #FIXME
187 'html_base': self.html_base, 183 else:
188 'image_base': self.image_base, 184 s = '<' + elt['tag']
189 'media_base': self.media_base } 185 if elt['args']:
190 markup = HtmlWiktionaryMarkup(**kwdict) 186 s += ' ' + elt['args']
191 markup.debug_level = self.debug_level 187 s += '>'
192 markup.parse() 188 s += self.format(elt['content'])
193 return '<pre><code>' + str(markup) + '</code></pre>' #FIXME 189 return s + '</' + elt['tag'] + '>'
194 190
195 def str_para(self, elt): 191 def str_para(self, elt):
196 string = ""; 192 string = "";
197 for x in elt['content']: 193 for x in elt['content']:
198 string += self.format(x) 194 string += self.format(x)
199 return "<p>" + string + "</p>" 195 return "<p>" + string + "</p>"
diff --git a/wiki2texi.py b/wiki2texi.py
index 7cc67bd..0b3eb77 100644
--- a/wiki2texi.py
+++ b/wiki2texi.py
@@ -116,35 +116,34 @@ class TexiWikiMarkup (WikiMarkup):
116 for x in elt['content']: 116 for x in elt['content']:
117 string += self.format(x) 117 string += self.format(x)
118 return string 118 return string
119 else: 119 else:
120 return str(elt) 120 return str(elt)
121 121
122 supported_tags = [ 'nowiki', 'code' ]
123 def input_tag(self, tag):
124 return tag['tag'] in self.supported_tags
125
126 def str_tag(self, elt): 122 def str_tag(self, elt):
127 if elt['tag'] == 'nowiki': 123 if elt['tag'] == 'nowiki':
128 return '@example\n' + elt['content'] + '@end example\n' 124 return '@example\n' + self.format(elt['content']) + '@end example\n'
129 elif elt['tag'] == 'code': 125 elif elt['tag'] == 'code':
130 kwdict = { 126 self.nested += 1
131 'nested': self.nested + 1, 127 s = self.format(elt['content'])
132 'lang': self.lang, 128 self.nested -= 1
133 'text': elt['content'],
134 'html_base': self.html_base,
135 'image_base': self.image_base,
136 'media_base': self.media_base }
137 markup = TexiWikiMarkup(**kwdict)
138 markup.debug_level = self.debug_level
139 markup.parse()
140 s = str(markup)
141 if not s.endswith("\n"): 129 if not s.endswith("\n"):
142 s += "\n"; 130 s += "\n"
143 return '@example\n' + s + '@end example\n' 131 return '@example\n' + s + '@end example\n'
144 132 elif elt['tag'] == 'tt':
133 self.nested += 1
134 s = self.format(elt['content'])
135 self.nested -= 1
136 return "@code{%s}" % s
137 else:
138 s = '<' + elt['tag']
139 if elt['args']:
140 s += ' ' + elt['args']
141 s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
142 return s
143
145 def str_para(self, elt): 144 def str_para(self, elt):
146 string = ""; 145 string = "";
147 for x in elt['content']: 146 for x in elt['content']:
148 string += self.format(x) 147 string += self.format(x)
149 return "\n" + string + "\n" 148 return "\n" + string + "\n"
150 149
@@ -153,13 +152,13 @@ class TexiWikiMarkup (WikiMarkup):
153 for x in elt['content']: 152 for x in elt['content']:
154 string += self.format(x) 153 string += self.format(x)
155 if self.nested: 154 if self.nested:
156 return string 155 return string
157 if not string.endswith("\n"): 156 if not string.endswith("\n"):
158 string += "\n"; 157 string += "\n";
159 return '@example\n' + string + '@end example\n' 158 return '\n@example\n' + string + '@end example\n'
160 159
161 def concat(self, eltlist): 160 def concat(self, eltlist):
162 string = "" 161 string = ""
163 for x in eltlist: 162 for x in eltlist:
164 string += self.format(x) 163 string += self.format(x)
165 return string 164 return string
diff --git a/wiki2text.py b/wiki2text.py
index 27a7051..d4cab81 100644
--- a/wiki2text.py
+++ b/wiki2text.py
@@ -139,31 +139,26 @@ class TextWikiMarkup (WikiMarkup):
139 length = 0 139 length = 0
140 linebuf = "" 140 linebuf = ""
141 linebuf += " " * wsc + s 141 linebuf += " " * wsc + s
142 length += wsc + wlen 142 length += wsc + wlen
143 return output + linebuf 143 return output + linebuf
144 144
145 supported_tags = [ 'nowiki', 'code' ]
146 def input_tag(self, tag):
147 return tag['tag'] in self.supported_tags
148
149 def str_tag(self, elt): 145 def str_tag(self, elt):
150 if elt['tag'] == 'nowiki': 146 if elt['tag'] == 'nowiki':
151 return elt['content'] 147 return self.format(elt['content'])
152 elif elt['tag'] == 'code': 148 elif elt['tag'] == 'code':
153 kwdict = { 149 self.nested += 1
154 'nested': self.nested + 1, 150 s = self.format(elt['content'])
155 'lang': self.lang, 151 self.nested -= 1
156 'text': elt['content'], 152 return s #FIXME
157 'html_base': self.html_base, 153 else:
158 'image_base': self.image_base, 154 s = '<' + elt['tag']
159 'media_base': self.media_base } 155 if elt['args']:
160 markup = TextWiktionaryMarkup(**kwdict) 156 s += ' ' + elt['args']
161 markup.debug_level = self.debug_level 157 s += '>' + self.format(elt['content']) + '</' + elt['tag'] + '>'
162 markup.parse() 158 return s
163 return str(markup)
164 159
165 def format(self, elt): 160 def format(self, elt):
166 if elt['type'] == 'TEXT': 161 if elt['type'] == 'TEXT':
167 if isinstance(elt['content'],list): 162 if isinstance(elt['content'],list):
168 string = "" 163 string = ""
169 for s in elt['content']: 164 for s in elt['content']:
diff --git a/wikicvt.py b/wikicvt.py
index e61e28b..c8ca887 100755
--- a/wikicvt.py
+++ b/wikicvt.py
@@ -14,24 +14,37 @@
14# 14#
15# You should have received a copy of the GNU General Public License 15# You should have received a copy of the GNU General Public License
16# along with this program. If not, see <http://www.gnu.org/licenses/>. 16# along with this program. If not, see <http://www.gnu.org/licenses/>.
17 17
18import sys 18import sys
19import getopt 19import getopt
20import StringIO
20from wiki2html import * 21from wiki2html import *
21from wiki2text import * 22from wiki2text import *
22from wiki2texi import * 23from wiki2texi import *
23 24
25class DumpWikiMarkup (WikiMarkup):
26 def __str__(self):
27 if self.tree:
28 s = StringIO.StringIO()
29 self.dump(self.tree, 0, s)
30 return s.getvalue()
31 else:
32 return ""
33
24def usage(code=0): 34def usage(code=0):
25 print """ 35 print """
26usage: %s [-hvt] [-I INTYPE] [-l lang] [-o kw=val] [--lang=lang] [--option kw=val] 36usage: %s [-hvt] [-I INTYPE] [-l lang] [-o kw=val] [--lang=lang] [--option kw=val]
27 [--input-type=INTYPE] [--type=OUTTYPE] [--help] [--verbose] file 37 [--input-type=INTYPE] [--type=OUTTYPE] [--help] [--verbose] file
28""" % (sys.argv[0]) 38""" % (sys.argv[0])
29 sys.exit(code) 39 sys.exit(code)
30 40
31handlers = { 41handlers = {
42 'dump': {
43 'default': DumpWikiMarkup
44 },
32 'html': { 45 'html': {
33 'default': HtmlWikiMarkup, 46 'default': HtmlWikiMarkup,
34 'wiktionary': HtmlWiktionaryMarkup 47 'wiktionary': HtmlWiktionaryMarkup
35 }, 48 },
36 'text': { 49 'text': {
37 'default': TextWikiMarkup, 50 'default': TextWikiMarkup,
@@ -48,15 +61,16 @@ def main():
48 otype = 'html' 61 otype = 'html'
49 lang = "pl" 62 lang = "pl"
50 kwdict = {} 63 kwdict = {}
51 debug = 0 64 debug = 0
52 65
53 try: 66 try:
54 opts, args = getopt.getopt(sys.argv[1:], "d:I:hl:o:t:v", 67 opts, args = getopt.getopt(sys.argv[1:], "Dd:I:hl:o:t:v",
55 ["debug=", "help", "lang=", "option=", 68 ["dump",
56 "to", "type", "input-text", "input-type", 69 "debug=", "help", "lang=", "option=",
70 "to=", "type=", "input-text", "input-type=",
57 "verbose" ]) 71 "verbose" ])
58 except getopt.GetoptError: 72 except getopt.GetoptError:
59 usage(1) 73 usage(1)
60 74
61 for o, a in opts: 75 for o, a in opts:
62 if o in ("-h", "--help"): 76 if o in ("-h", "--help"):
@@ -74,25 +88,27 @@ def main():
74 if val != '': 88 if val != '':
75 kwdict[kw] = val 89 kwdict[kw] = val
76 elif o == "--input-text": 90 elif o == "--input-text":
77 input_text = True 91 input_text = True
78 elif o in ("-d", "--debug"): 92 elif o in ("-d", "--debug"):
79 debug = eval(a) 93 debug = eval(a)
94 elif o in ("-D", "--dump"):
95 otype = 'dump'
80 96
81 if len(args) == 1: 97 if len(args) == 1:
82 if args[0] == '-': 98 if args[0] == '-':
83 kwdict['file'] = sys.stdin 99 kwdict['file'] = sys.stdin
84 else: 100 else:
85 kwdict['filename'] = args[0] 101 kwdict['filename'] = args[0]
86 else: 102 else:
87 usage(1) 103 usage(1)
88 104
89 kwdict['lang']=lang 105 kwdict['lang']=lang
90 106
91 if handlers.has_key(otype): 107 if otype in handlers:
92 if handlers[otype].has_key(itype): 108 if itype in handlers[otype]:
93 markup = handlers[otype][itype](**kwdict) 109 markup = handlers[otype][itype](**kwdict)
94 markup.debug_level = debug 110 markup.debug_level = debug
95 markup.parse() 111 markup.parse()
96 print str(markup) 112 print str(markup)
97 exit(0) 113 exit(0)
98 else: 114 else:
diff --git a/wikimarkup.py b/wikimarkup.py
index fde1ec1..9a79d1e 100644
--- a/wikimarkup.py
+++ b/wikimarkup.py
@@ -19,15 +19,15 @@ import sys
19import re 19import re
20from types import * 20from types import *
21 21
22__all__ = [ "BaseWikiMarkup", "WikiMarkup", 22__all__ = [ "BaseWikiMarkup", "WikiMarkup",
23 "envtypes" ] 23 "envtypes" ]
24 24
25delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)") 25delim = re.compile("^==+|==+[ \\t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<")
26otag = re.compile("^\s*<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>") 26otag = re.compile("(?P<pfx>[^<]*)<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>.+))?\s*(?P<closed>/)?>")
27ctag = re.compile("^\s*</(?P<tag>[a-zA-Z0-9_]+)\s*>") 27ctag = re.compile("(?P<pfx>[^<]*)</(?P<tag>[a-zA-Z0-9_]+)\s*>")
28 28
29close_delim = { 29close_delim = {
30 '[': ']', 30 '[': ']',
31 '[[': ']]', 31 '[[': ']]',
32 '{{': '}}' 32 '{{': '}}'
33} 33}
@@ -43,22 +43,121 @@ class BaseWikiMarkup(object):
43 43
44 toklist = None 44 toklist = None
45 tokind = 0 45 tokind = 0
46 newline = 0 46 newline = 0
47 tree = None 47 tree = None
48 48
49 tags = [ 'code', 'nowiki', 'tt', 'div' ]
50
49 nested = 0 51 nested = 0
50 debug_level = 0 52 debug_level = 0
51 53
52 def dprint(self, lev, fmt, *argv): 54 def dprint(self, lev, fmt, *argv):
53 if self.debug_level >= lev: 55 if self.debug_level >= lev:
54 print "[DEBUG]", fmt % argv 56 print "[DEBUG]", fmt % argv
55 57
56 def input_tag(self, tag): 58 def print_dump_prefix(self, level, file):
59 file.write("[DUMP]" + ' ' * (2*level + 1))
60
61 def dump_nil(self, node, level, file):
57 pass 62 pass
58 63
64 def dump_text(self, node, level, file):
65 self.print_dump_prefix(level, file)
66 file.write("CONTENT: \"%s\"\n" % node['content'])
67
68 def dump_delim(self, node, level, file):
69 file.write("'%s'" % node['content'])
70 if 'continuation' in node:
71 file.write(" (cont)")
72 file.write("\n")
73
74 def dump_tag(self, node, level, file):
75 self.print_dump_prefix(level, file)
76 file.write("TAG: %s\n" % node['tag'])
77 if 'args' in node:
78 self.print_dump_prefix(level, file)
79 file.write("ARGS: %s\n" % node['args'])
80 if 'content' in node:
81 self.dump_node(node['content'], level + 1, file)
82
83 def dump_seq(self, node, level, file):
84 self.dump(node['content'], level + 1, file)
85
86 def dump_ref(self, node, level, file):
87 self.print_dump_prefix(level, file)
88 file.write("REF: %s\n" % node['ref'])
89 self.dump_node(node['content'], level + 1, file)
90
91 def dump_hdr(self, node, level, file):
92 self.print_dump_prefix(level, file)
93 file.write("LEVEL: %s\n" % node['level'])
94 self.dump_node(node['content'], level + 1, file)
95
96 def dump_elt(self, node, level, file):
97 self.print_dump_prefix(level, file)
98 file.write("SUBTYPE: %s\n" % node['subtype'])
99 self.dump_node(node['content'], level + 1, file)
100
101 def dump_env(self, node, level, file):
102 self.print_dump_prefix(level, file)
103 file.write("ENVTYPE: %s\n" % node['envtype'])
104 self.print_dump_prefix(level, file)
105 file.write("LEVEL: %s\n" % node['level'])
106 self.dump(node['content'], level + 1, file)
107
108 def dump_ind(self, node, level, file):
109 self.print_dump_prefix(level, file)
110 file.write("LEVEL: %s\n" % node['level'])
111 self.dump_node(node['content'], level + 1, file)
112
113 def dump_link(self, node, level, file):
114 self.dump(node['content'], level + 1, file)
115
116 dump_type = {
117 'NIL': dump_nil,
118 'NL': dump_nil,
119 'TEXT': dump_text,
120 'DELIM': dump_delim,
121 'OTAG': dump_tag,
122 'CTAG': dump_tag,
123 'TAG': dump_tag,
124 'SEQ': dump_seq,
125 'REF': dump_ref,
126 'HDR': dump_hdr,
127 'ELT': dump_elt,
128 'ENV': dump_env,
129 'IND': dump_ind,
130 'BAR': dump_nil,
131 'PARA': dump_seq,
132 'PRE': dump_text,
133 'BOLD': dump_seq,
134 'IT': dump_seq,
135 'LINK': dump_link,
136 }
137
138 def dump_node(self, node, level, file):
139 if type(node) != dict:
140 file.write("UNHANDLED NODE: %s, %s\n" % (type(node),node))
141 return
142
143 self.print_dump_prefix(level, file)
144 file.write("NODE " + node['type'] + ":\n")
145 if node['type'] in self.dump_type:
146 self.dump_type[node['type']](self, node, level, file)
147 else:
148 self.print_dump_prefix(level, file)
149 file.write("(UNHANDLED) ")
150 file.write("%s\n" % node)
151 self.print_dump_prefix(level, file)
152 file.write("END NODE " + node['type'] + "\n")
153
154 def dump(self, tree, level=0, file=sys.stdout):
155 for node in tree:
156 self.dump_node(node, level, file)
157
59 def tokread(self): 158 def tokread(self):
60 line = None 159 line = None
61 pos = 0 160 pos = 0
62 while 1: 161 while 1:
63 if (not line or pos == len(line)): 162 if (not line or pos == len(line)):
64 try: 163 try:
@@ -80,58 +179,78 @@ class BaseWikiMarkup(object):
80 m = delim.search(line, pos) 179 m = delim.search(line, pos)
81 180
82 if m: 181 if m:
83 if (pos < m.start(0)): 182 if (pos < m.start(0)):
84 yield({'type': 'TEXT', 'content': line[pos:m.start(0)]}) 183 yield({'type': 'TEXT', 'content': line[pos:m.start(0)]})
85 pos = m.end(0) 184 pos = m.end(0)
86 if envtypes.has_key(m.group(0)[0]) and line[pos] == ":": 185
87 # FIXME? 186 if m and line[m.start(0)] != '<':
88 # FIXME: What's "extra"? 187 if m.group(0)[0] in envtypes and pos < len(line) and line[pos] == ":":
89 yield({ 'type': 'DELIM', 188 yield({ 'type': 'DELIM',
90 'content': m.group(0) }) 189 'content': m.group(0),
190 'continuation': True })
91 pos += 1 191 pos += 1
92 else: 192 else:
93 yield({ 'type': 'DELIM', 193 yield({ 'type': 'DELIM',
94 'content': m.group(0) }) 194 'content': m.group(0) })
95 else: 195 else:
96 m = otag.match(line)
97 if m: 196 if m:
98 t = { 'type': 'TAG', 197 pos -= 1
198 t = None
199 m = otag.match(line, pos)
200 if m and m.group('tag') in self.tags:
201 rest = line[m.end(0):]
202 line = m.group('pfx')
203 pos = 0
204 t = { 'type': 'OTAG',
99 'tag': m.group('tag'), 205 'tag': m.group('tag'),
100 'args': m.group('args') } 206 'args': m.group('args') }
101 207 else:
102 if self.input_tag(t): 208 m = ctag.match(line, pos)
209 if m and m.group('tag') in self.tags:
210 rest = line[m.end(0):]
211 line = m.group('pfx')
212 pos = 0
213 t = { 'type': 'CTAG',
214 'tag': m.group('tag') }
215
216 if line:
217 if line[-1] == '\n':
218 if line[pos:-1] != '':
219 yield({ 'type': 'TEXT',
220 'content': line[pos:-1] })
221 yield({ 'type': 'NL',
222 'content': '\n' })
223 else:
224 yield({ 'type': 'TEXT',
225 'content': line[pos:] })
226
227 if t:
228 if t['type'] == 'OTAG' and t['tag'] == 'nowiki':
103 s = '' 229 s = ''
104 if not m.group('closed'): 230 if not m.group('closed'):
105 while 1: 231 while 1:
106 try: 232 try:
107 l = self.input() 233 l = self.input()
108 m = ctag.match(l) 234 m = ctag.match(l)
109 if m and m.group('tag') == t['tag']: 235 if m and m.group('tag') == t['tag']:
110 break 236 break
111 s += l 237 s += l
112 except StopIteration: 238 except StopIteration:
113 break 239 break
114 yield({ 'type': 'TAG', 240 t['type'] = 'TAG'
115 'tag': t['tag'], 241 t['content'] = {'type': 'TEXT', 'content': s}
116 'args': t['args'], 242
117 'content': s 243 yield(t)
118 }) 244 if t['type'] == 'OTAG' and m.group('closed'):
119 line = None 245 t['type'] = 'CTAG'
120 continue 246 yield(t)
121 247 line = rest
122 if line[-1] == '\n': 248 pos = 0
123 if line[pos:-1] != '':
124 yield({ 'type': 'TEXT',
125 'content': line[pos:-1] })
126 yield({ 'type': 'NL',
127 'content': '\n' })
128 else: 249 else:
129 yield({ 'type': 'TEXT', 250 line = None
130 'content': line[pos:] })
131 line = None
132 251
133 def input(self): 252 def input(self):
134 return None 253 return None
135 254
136 def swaptkn(self, i, j): 255 def swaptkn(self, i, j):
137 self.dprint(80, "SWAPPING %s <-> %s", i, j) 256 self.dprint(80, "SWAPPING %s <-> %s", i, j)
@@ -191,38 +310,39 @@ class BaseWikiMarkup(object):
191 310
192 def setkn(self,val): 311 def setkn(self,val):
193 self.toklist[self.tokind] = val 312 self.toklist[self.tokind] = val
194 313
195 def getkn(self): 314 def getkn(self):
196 self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' 315 self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
316 if self.tokind == len(self.toklist):
317 return { 'type': 'NIL' }
197 tok = self.toklist[self.tokind] 318 tok = self.toklist[self.tokind]
198 if tok['type'] != 'NIL': 319 self.tokind = self.tokind + 1
199 self.tokind = self.tokind + 1
200 return tok 320 return tok
201 321
202 def ungetkn(self): 322 def ungetkn(self):
203 self.tokind = self.tokind - 1 323 self.tokind = self.tokind - 1
204 self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL' 324 self.newline = self.tokind == 0 or self.toklist[self.tokind-1]['type'] == 'NL'
205 return self.toklist[self.tokind] 325 return self.toklist[self.tokind]
206 326
207 def parse_fontmod(self,delim,what): 327 def parse_fontmod(self,delim,what):
208 self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s", 328 self.dprint(80, "ENTER parse_fontmod(%s,%s), tok %s",
209 delim, what, self.peektkn()) 329 delim, what, self.peektkn())
210 seq = [] 330 seq = []
211 textlist = [] 331 text = ''
212 while 1: 332 while 1:
213 tok = self.getkn() 333 tok = self.getkn()
214 if tok['type'] == 'TEXT': 334 if tok['type'] == 'TEXT':
215 textlist.append(tok['content']) 335 text += tok['content']
216 elif tok['type'] == 'DELIM': 336 elif tok['type'] == 'DELIM':
217 if tok['content'] == delim: 337 if tok['content'] == delim:
218 break 338 break
219 elif self.is_inline_delim(tok): 339 elif self.is_inline_delim(tok):
220 if textlist: 340 if text:
221 seq.append({ 'type': 'TEXT', 'content': textlist }) 341 seq.append({ 'type': 'TEXT', 'content': text })
222 textlist = [] 342 text = ''
223 x = self.parse_inline(tok) 343 x = self.parse_inline(tok)
224 if x: 344 if x:
225 seq.append(x) 345 seq.append(x)
226 else: 346 else:
227 self.dprint(80, "LEAVE parse_fontmod=%s", "None") 347 self.dprint(80, "LEAVE parse_fontmod=%s", "None")
228 return None 348 return None
@@ -234,14 +354,14 @@ class BaseWikiMarkup(object):
234 self.dprint(80, "LEAVE parse_fontmod=None") 354 self.dprint(80, "LEAVE parse_fontmod=None")
235 return None 355 return None
236 seq.append({ 'type': 'TEXT', 'content': '\n' }) 356 seq.append({ 'type': 'TEXT', 'content': '\n' })
237 else: 357 else:
238 self.dprint(80, "LEAVE parse_fontmod=None") 358 self.dprint(80, "LEAVE parse_fontmod=None")
239 return None 359 return None
240 if textlist: 360 if text:
241 seq.append({ 'type': 'TEXT', 'content': textlist }) 361 seq.append({ 'type': 'TEXT', 'content': text })
242 res = { 'type': what, 'content': seq } 362 res = { 'type': what, 'content': seq }
243 self.dprint(80, "LEAVE parse_fontmod=%s", res) 363 self.dprint(80, "LEAVE parse_fontmod=%s", res)
244 return res 364 return res
245 365
246 def parse_link(self, type, delim): 366 def parse_link(self, type, delim):
247 self.dprint(80, "ENTER parse_link(%s,%s), tok %s", 367 self.dprint(80, "ENTER parse_link(%s,%s), tok %s",
@@ -340,22 +460,28 @@ class BaseWikiMarkup(object):
340 460
341 def parse_para(self): 461 def parse_para(self):
342 self.dprint(80, "ENTER parse_para, tok %s", self.peektkn()) 462 self.dprint(80, "ENTER parse_para, tok %s", self.peektkn())
343 seq = [] 463 seq = []
344 textlist = [] 464 textlist = []
345 tok = self.peektkn() 465 tok = self.peektkn()
346 if re.match("^\s", tok['content']): 466
347 type = 'PRE' 467 if self.newline:
348 rx = re.compile("^\S") 468 if re.match("^\s", tok['content']):
469 type = 'PRE'
470 rx = re.compile("^\S")
471 else:
472 type = 'PARA'
473 rx = re.compile("^\s")
349 else: 474 else:
350 type = 'PARA' 475 type = 'SEQ'
351 rx = re.compile("^\s") 476 rx = None
477
352 while 1: 478 while 1:
353 tok = self.getkn() 479 tok = self.getkn()
354 if tok['type'] == 'TEXT': 480 if tok['type'] == 'TEXT':
355 if self.newline and rx.match(tok['content']): 481 if rx and self.newline and rx.match(tok['content']):
356 self.ungetkn() 482 self.ungetkn()
357 break 483 break
358 textlist.append(tok['content']) 484 textlist.append(tok['content'])
359 elif tok['type'] == 'NL': 485 elif tok['type'] == 'NL':
360 tok = self.getkn() 486 tok = self.getkn()
361 if tok['type'] == 'NL' or tok['type'] == 'NIL': 487 if tok['type'] == 'NL' or tok['type'] == 'NIL':
@@ -364,16 +490,20 @@ class BaseWikiMarkup(object):
364 self.ungetkn() 490 self.ungetkn()
365 if self.is_block_delim(tok): 491 if self.is_block_delim(tok):
366 break 492 break
367 textlist.append('\n') 493 textlist.append('\n')
368 elif tok['type'] == 'NIL': 494 elif tok['type'] == 'NIL':
369 break 495 break
496 elif tok['type'] == 'OTAG' or tok['type'] == 'CTAG' or tok['type'] == 'TAG':
497 self.ungetkn()
498 break
370 elif tok['type'] == 'DELIM': 499 elif tok['type'] == 'DELIM':
371 if self.is_inline_delim(tok): 500 if self.is_inline_delim(tok):
372 if textlist: 501 if textlist:
373 seq.append({ 'type': 'TEXT', 'content': textlist }) 502 seq.append({ 'type': 'TEXT',
503 'content': ''.join(textlist) })
374 textlist = [] 504 textlist = []
375 x = self.parse_inline(tok) 505 x = self.parse_inline(tok)
376 if x: 506 if x:
377 seq.append(x) 507 seq.append(x)
378 else: 508 else:
379 self.dprint(80, "ROLLBACK parse_para=%s", tok) 509 self.dprint(80, "ROLLBACK parse_para=%s", tok)
@@ -394,13 +524,13 @@ class BaseWikiMarkup(object):
394 break 524 break
395 else: 525 else:
396 seq.append({ 'type': 'TEXT', 'content': tok['content'] }) 526 seq.append({ 'type': 'TEXT', 'content': tok['content'] })
397 # self.ungetkn() 527 # self.ungetkn()
398 break 528 break
399 if textlist: 529 if textlist:
400 seq.append({ 'type': 'TEXT', 'content': textlist }) 530 seq.append({ 'type': 'TEXT', 'content': ''.join(textlist) })
401 self.dprint(80, "LEAVE parse_para=%s", seq) 531 self.dprint(80, "LEAVE parse_para=%s", seq)
402 return { 'type': type, 'content': seq } 532 return { 'type': type, 'content': seq }
403 533
404 def parse_header(self, delim): 534 def parse_header(self, delim):
405 self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn()) 535 self.dprint(80, "ENTER parse_header(%s), tok %s", delim, self.peektkn())
406 list = [] 536 list = []
@@ -440,107 +570,158 @@ class BaseWikiMarkup(object):
440 while 1: 570 while 1:
441 tok = self.getkn() 571 tok = self.getkn()
442 if tok['type'] == 'NL' or tok['type'] == 'NIL': 572 if tok['type'] == 'NL' or tok['type'] == 'NIL':
443 break 573 break
444 elif tok['type'] == 'TEXT': 574 elif tok['type'] == 'TEXT':
445 list.append(tok) 575 list.append(tok)
446 elif tok['type'] == 'DELIM' and tok['content'][0] == ":": 576 elif tok['type'] == 'DELIM':
447 list.append(self.parse_indent(len(tok['content']))) 577 if tok['content'][0] == ":":
448 break 578 list.append(self.parse_indent(len(tok['content'])))
449 else: 579 break
450 x = self.parse_inline(tok)
451 if x:
452 list.append(x)
453 else: 580 else:
454 list.append(tok) 581 x = self.parse_inline(tok)
582 if x:
583 list.append(x)
584 else:
585 list.append(tok)
586 else:
587 list.append(tok)
455 self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list) 588 self.dprint(80, "LEAVE parse_line=(SEQ, %s)", list)
456 return { 'type': 'SEQ', 'content': list } 589 return { 'type': 'SEQ', 'content': list }
457 590
458 def parse_env(self, type, lev): 591 def parse_env(self, type, lev):
459 self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn()) 592 self.dprint(80, "ENTER parse_env(%s,%s), tok %s",type,lev,self.peektkn())
460 list = [] 593 list = []
461 while 1: 594 while 1:
462 tok = self.getkn() 595 tok = self.getkn()
463 if tok['type'] == 'DELIM' \ 596 if tok['type'] == 'DELIM' \
464 and envtypes.has_key(tok['content'][0]) \ 597 and tok['content'][0] in envtypes \
465 and type == envtypes[tok['content'][0]][0]: 598 and type == envtypes[tok['content'][0]][0]:
466 if len(tok['content']) < lev: 599 if len(tok['content']) < lev:
467 self.ungetkn() 600 self.ungetkn()
468 break 601 break
469 elif len(tok['content']) > lev: 602 elif len(tok['content']) > lev:
470 self.ungetkn() 603 self.ungetkn()
471 elt = self.parse_env(type, len(tok['content'])) 604 elt = self.parse_env(type, len(tok['content']))
472 else: 605 else:
473 elt = self.parse_line() 606 elt = self.parse_line()
474 if len(tok.keys()) == 2: 607 if 'continuation' not in tok:
475 list.append({ 'type': 'ELT', 608 list.append({ 'type': 'ELT',
476 'subtype': envtypes[tok['content'][0]][1], 609 'subtype': envtypes[tok['content'][0]][1],
477 'content': elt }) 610 'content': elt })
478 continue 611 continue
479 612
480 if list[-1]['content']['type'] != 'SEQ': 613 if list:
481 x = list[-1]['content']['content'] 614 if list[-1]['content']['type'] != 'SEQ':
482 # FIXME: 615 x = list[-1]['content']['content']
483 list[-1]['content'] = { 'type': 'SEQ', 'content': [x] } 616 # FIXME:
484 list[-1]['content']['content'].append(elt) 617 list[-1]['content'] = { 'type': 'SEQ', 'content': [x] }
618 list[-1]['content']['content'].append(elt)
485 else: 619 else:
486 self.ungetkn() 620 self.ungetkn()
487 break 621 break
488 self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list) 622 self.dprint(80, "LEAVE parse_env=(ENV, %s, %s, %s)", type, lev, list)
489 return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list } 623 return { 'type': 'ENV', 'envtype': type, 'level': lev, 'content': list }
490 624
491 def parse_indent(self, lev): 625 def parse_indent(self, lev):
492 self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn()) 626 self.dprint(80, "ENTER parse_indent(%s), tok %s", lev, self.peektkn())
493 x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() } 627 x = { 'type': 'IND', 'level': lev, 'content': self.parse_line() }
494 self.dprint(80, "LEAVE parse_indent=%s", x) 628 self.dprint(80, "LEAVE parse_indent=%s", x)
495 return x 629 return x
496 630
631 def parse_til(self, tag):
632 self.dprint(80, "ENTER parse_til(%s)", tag)
633 seq = []
634 save = self.tokind
635 while 1:
636 t = self.parse0()
637 if t == None or t['type'] == 'NIL':
638 self.tokind = save
639 s = '<' + tag['tag']
640 if 'args' in tag and tag['args']:
641 s += ' ' + tag['args']
642 del tag['args']
643 s += '>'
644 if 'content' in tag:
645 subtree = tag['content']
646 else:
647 subtree = None
648 tag['type'] = 'TEXT'
649 tag['content'] = s
650 if subtree:
651 self.tree[self.tokind:self.tokind] = subtree
652 self.dprint(80, "LEAVE parse_til = %s (tree modified)", tag)
653 self.ungetkn()
654 return self.parse0()
655
656 if t['type'] == 'CTAG' and tag['tag'] == t['tag']:
657 break
658 seq.append(t)
659
660 ret = { 'type': 'TAG',
661 'tag': tag['tag'],
662 'args': tag['args'],
663 'content': { 'type': 'SEQ', 'content': seq } }
664 self.dprint(80, "LEAVE parse_til = %s", ret)
665 return ret
666
497 def parse0(self): 667 def parse0(self):
498 tok = self.getkn() 668 tok = self.getkn()
669 self.dprint(80, "parse0: %s", tok)
499 toktype = tok['type'] 670 toktype = tok['type']
500 if toktype == 'NIL': 671 if toktype == 'NIL':
501 return None 672 return None
502 elif toktype == 'TEXT': 673 elif toktype == 'TEXT':
503 self.ungetkn() 674 self.ungetkn()
504 return self.parse_para() 675 return self.parse_para()
505 elif toktype == 'DELIM': 676 elif toktype == 'DELIM':
506 if tok['content'] == "----": 677 if tok['content'] == "----":
507 return { 'type': 'BAR' } 678 return { 'type': 'BAR' }
508 elif tok['content'][0:2] == "==": 679 elif tok['content'][0:2] == "==":
509 return self.parse_header(tok['content']) 680 return self.parse_header(tok['content'])
510 elif envtypes.has_key(tok['content'][0]): 681 elif tok['content'][0] in envtypes:
511 type = envtypes[tok['content'][0]][0] 682 type = envtypes[tok['content'][0]][0]
512 lev = len(tok['content']) 683 lev = len(tok['content'])
513 self.ungetkn() 684 self.ungetkn()
514 return self.parse_env(type, lev) 685 return self.parse_env(type, lev)
515 elif tok['content'][0] == ":": 686 elif tok['content'][0] == ":":
516 return self.parse_indent(len(tok['content'])) 687 return self.parse_indent(len(tok['content']))
517 else: 688 else:
518 self.ungetkn() 689 self.ungetkn()
519 return self.parse_para() 690 return self.parse_para()
520 elif toktype == 'NL': 691 elif toktype == 'NL':
521 return { 'type': 'TEXT', 'content': '\n' } 692 return { 'type': 'TEXT', 'content': '\n' }
522# return self.parse0() 693 elif toktype == 'OTAG':
694 return self.parse_til(tok)
523 else: 695 else:
524 return tok 696 return tok
525 697
526 def parse(self): 698 def parse(self):
527 if not self.toklist: 699 if not self.toklist:
528 self.tokenize() 700 self.tokenize()
529 self.dprint(90, "TOKLIST: %s", self.toklist) 701 if self.debug_level >= 90:
702 print("TOKEN DUMP BEGIN")
703 self.dump(self.toklist)
704 print("TOKEN DUMP END")
705
530 self.tokind = 0 706 self.tokind = 0
531 self.tree = [] 707 self.tree = []
532 while 1: 708 while 1:
533 subtree = self.parse0() 709 subtree = self.parse0()
534 if subtree == None: 710 if subtree == None:
535 break 711 break
536 self.tree.append(subtree) 712 self.tree.append(subtree)
713
537 if self.nested: 714 if self.nested:
538 if self.tree[0]['type'] == 'PARA': 715 if self.tree[0]['type'] == 'PARA':
539 self.tree[0]['type'] = 'SEQ' 716 self.tree[0]['type'] = 'SEQ'
540 self.dprint(70, "TREE: %s", self.tree) 717
718 if self.debug_level >= 70:
719 print("TREE DUMP BEGIN")
720 self.dump(self.tree)
721 print("TREE DUMP END")
541 722
542 def __str__(self): 723 def __str__(self):
543 return str(self.tree) 724 return str(self.tree)
544 725
545 726
546class WikiMarkup (BaseWikiMarkup): 727class WikiMarkup (BaseWikiMarkup):
@@ -616,17 +797,13 @@ class WikiMarkup (BaseWikiMarkup):
616 if m: # and m.group(1) in self.langtab: 797 if m: # and m.group(1) in self.langtab:
617 return True 798 return True
618 return False 799 return False
619 800
620 def is_empty_text(self, elt): 801 def is_empty_text(self, elt):
621 if elt['type'] == 'TEXT': 802 if elt['type'] == 'TEXT':
622 if isinstance(elt['content'],list): 803 if re.search('\w', elt['content']):
623 for s in elt['content']:
624 if re.search('\w', s):
625 return False
626 elif re.search('\w', elt['content']):
627 return False 804 return False
628 return True 805 return True
629 return False 806 return False
630 807
631 def is_empty_para(self, seq): 808 def is_empty_para(self, seq):
632 for x in seq: 809 for x in seq:

Return to:

Send suggestions and report system problems to the System administrator.