diff options
Diffstat (limited to 'wikitrans/wikimarkup.py')
-rw-r--r-- | wikitrans/wikimarkup.py | 35 |
1 file changed, 30 insertions, 5 deletions
diff --git a/wikitrans/wikimarkup.py b/wikitrans/wikimarkup.py index 19f69e6..1e2429f 100644 --- a/wikitrans/wikimarkup.py +++ b/wikitrans/wikimarkup.py @@ -142,12 +142,19 @@ class WikiMarkupParser(object): Abstract methods (must be overridden by the subclass): input() -- returns next physical line from the input material. Public attributes: + Input: + debug_level -- debug verbosity level (0 - no debug info, 100 - excessively + copious debug messages). Default is 0. + strict -- if True, parser will throw exception upon encountering + invalid markup tag (mostly for future use) + + Output: tree -- constructed parse tree (a subclass of WikiNode) """ delim = re.compile("^==+[ \t]*|[ \t]*==+[ \t]*$|(^----$)|^\\*+|^#+|^[;:]+|(\\[\\[)|\\[|(\\{\\{)|(\\]\\])|\\]|(\\}\\})|\\||(\\'\\'\\'?)|<") otag = re.compile("<(?P<tag>[a-zA-Z0-9_]+)(?:\s+(?P<args>[^/][^>]+))?\s*(?P<closed>/)?>") @@ -172,25 +179,26 @@ class WikiMarkupParser(object): newline = 0 tree = None tags = [ 'code', 'nowiki', 'tt', 'div', 'ref', 'references' ] debug_level = 0 + strict = False def dprint(self, lev, fmt, *argv): """If current debug level is greater than or equal to lev, print *argv according to format. """ if self.debug_level >= lev: for l in (fmt % argv).split('\n'): print("[DEBUG] %s" % l) inline_delims = [ "''", "'''", "[", "]", "[[", "]]", "{{", "}}", "|" ] token_class = {} - + def _new_node(self, **kwarg): return self.token_class[kwarg['type']](self, **kwarg) def tokread(self): """Read next token from the input. Return it as a subclass of WikiNode.""" line = None @@ -371,13 +379,13 @@ class WikiMarkupParser(object): stack.append(i) else: # Push the token on stack stack.append(i) # Redefine all non-matched tokens as TEXT for i in stack: - # FIXME + # FIXME: How to convert node to TEXT? 
self.toklist[i] = self._new_node(type='TEXT', content=str(self.toklist[i])) mark = [] def push_mark(self): @@ -489,13 +497,16 @@ class WikiMarkupParser(object): flush() acc['seq'].append(self.parse_tag(tok)) elif tok.type == 'DELIM': flush() acc['seq'].append(self.parse_inline_delim(tok)) else: - raise UnexpectedTokenError(tok) + if self.strict: + raise UnexpectedTokenError(tok) + # FIXME: Another possible variant of handling this case is to + # convert tok to TEXT node and append it to acc['seq'] tok = self.getkn() flush() if acc['seq']: tok = self._new_node(type=type, content=acc['seq']) else: tok = None @@ -973,12 +984,19 @@ class WikiMarkup(WikiMarkupParser): image_base=URL Base URL for images. Default is 'http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf' media_base=URL Base URL for media files. Default is 'http://www.mediawiki.org/xml/export-0.3' + + debug_level=INT + debug verbosity level (0 - no debug info, 100 - excessively + copious debug messages). Default is 0. + strict=BOOL + Strict parsing mode. Throw exceptions on syntax errors. Default + is False. 
""" self.token_class = { 'NIL': WikiNode, 'NL': WikiNode, 'OTAG': WikiTagNode, 'CTAG': WikiTagNode, @@ -996,28 +1014,35 @@ class WikiMarkup(WikiMarkupParser): 'BOLD': WikiSeqNode, 'ELT': WikiEltNode, 'ENV': WikiEnvNode, 'LINK': WikiSeqNode, 'HDR': WikiHdrNode } - + for kw in keywords: if kw == 'file': self.file = keywords[kw] elif kw == 'filename': self.file = open(keywords[kw]) elif kw == 'text': - self.text = keywords[kw].split("\n") + if sys.version_info[0] > 2: + self.text = keywords[kw].decode('utf-8').split("\n") + else: + self.text = keywords[kw].split("\n") elif kw == 'lang': self.lang = keywords[kw] elif kw == 'html_base': self.html_base = keywords[kw] elif kw == 'image_base': self.image_base = keywords[kw] elif kw == 'media_base': self.media_base = keywords[kw] + elif kw == 'strict': + self.strict = keywords[kw] + elif kw == 'debug_level': + self.debug_level = keywords[kw] def __del__(self): if self.file: self.file.close() def input(self): |