diff options
-rw-r--r-- | README.rst | 168 | ||||
-rw-r--r-- | WikiTrans/wiki2html.py | 4 | ||||
-rw-r--r-- | WikiTrans/wiki2texi.py | 8 | ||||
-rw-r--r-- | WikiTrans/wiki2text.py | 12 |
4 files changed, 178 insertions, 14 deletions
@@ -1,3 +1,169 @@ MediaWiki Markup Translator =========================== -FIXME +This package provides a Python framework for translating MediaWiki +articles to various formats. The present version supports +conversions to plain text, HTML, and Texinfo formats. + +A command line converter utility is included. + +Classes +======= + +class ``WikiMarkup`` +-------------------- +A base class for all translator classes. Unless you plan to extend +wikitrans, you will never have to create objects of this +class. Instead, you will be using one of its derived classes. + +Constructor arguments common to all derived classes: + +filename = *name* + The file *name* is opened and used for input. +file = *fd* + An already opened file *fd* is used for input. +text = *string* + Input is taken from *string*, line by line. + +lang = *code* + Specifies language version. Default is ``en``. This variable can be + referred to as ``%(lang)s`` in the keyword arguments below. +html_base = *url* + Base URL for cross-references. Default is + ``http://%(lang)s.wiktionary.org/wiki/``. +image_base = *url* + Base URL for images. Default is + ``http://upload.wikimedia.org/wikipedia/commons/thumb/a/bf`` +media_base = *url* + Base URL for media files. Default is + ``http://www.mediawiki.org/xml/export-0.3`` + + +class ``TextWikiMarkup`` +------------------------ +Translates material in Wiki markup language to plain text. Usage:: + + from WikiTrans.wiki2text import TextWikiMarkup + + markup = TextWikiMarkup(filename='input.txt') + markup.parse() + print(str(markup)) + +Specific constructor arguments: + +width = *N* + Limit output width to *N* columns. Default is 78. +show_urls = *bool* + Whether or not to show the URLs links refer to. If *bool* is + ``True``, a URL will be displayed in parentheses next + to the link text. If ``False`` (the default), only the link text will be displayed. 
+ +class ``TextWiktionaryMarkup`` +------------------------------ +Translate material from wiktionary to plain text form. This is +supposed to provide a wiktionary-specific form of +``TextWikiMarkup``. Currently both classes are entirely equivalent. + +class ``TexiWikiMarkup`` +------------------------ +Translate Wiki markup to Texinfo source. Usage:: + + from WikiTrans.wiki2texi import TexiWikiMarkup + + markup = TexiWikiMarkup(filename='input.txt') + markup.parse() + print(str(markup)) + +Two markup-specific keywords control the sectioning model used. + +sectioning_model = *model* + Selects the Texinfo sectioning model for the output + document. Possible values are: + + ``numbered`` + Top of document is marked with ``@top``. Headings (``=``, ``==``, + ``===``, etc.) produce ``@chapter``, + ``@section``, ``@subsection``, etc. + ``unnumbered`` + Unnumbered sectioning: ``@top``, ``@unnumbered``, ``@unnumberedsec``, + ``@unnumberedsubsec``. + ``appendix`` + Sectioning suitable for appendix entries: ``@top``, ``@appendix``, + ``@appendixsec``, ``@appendixsubsec``, etc. + ``heading`` + Use heading directives to reflect sectioning: ``@majorheading``, + ``@chapheading``, ``@heading``, ``@subheading``, etc. + +sectioning_start = *n* + Shift resulting heading level by *n* positions. For example, supposing + ``sectioning_model=numbered``, ``== A ==`` will produce ``@section + A`` on output. If ``sectioning_start=1`` is also given, this + directive will produce ``@subsection A`` instead. + +class ``HtmlWikiMarkup`` +------------------------ +Translates Wiki markup to HTML. Usage:: + + from WikiTrans.wiki2html import HtmlWikiMarkup + + markup = HtmlWikiMarkup(filename='input.txt') + markup.parse() + print(str(markup)) + +Supported keywords are the same as for the ``WikiMarkup`` class. + +class ``HtmlWiktionaryMarkup`` +------------------------------ +Translate material from wiktionary to HTML form. This is +supposed to provide a wiktionary-specific form of +``HtmlWikiMarkup``. 
Currently both classes are equivalent. + +The ``wikitrans`` utility +========================= +This command line utility converts the supplied text to a selected +output format. The usage syntax is:: + + wikitrans [OPTIONS] ARG + +If ARG looks like a URL, the wiki text to be converted will be +downloaded from that URL. + +Otherwise, if the ``--base-url=URL`` option is given, ARG is treated as +the name of the page to get from the MediaWiki installation at ``URL``. + +Otherwise, ARG is treated as the name of the file to read wiki +material from. + +Examples:: + + wikitrans text.wiki + + wikitrans --base-url http://en.wiktionary.org door + + wikitrans https://en.wiktionary.org/wiki/Special:Export/door + +Options are: + +``--version`` + Show program's version number and exit. +``-h``, ``--help`` + Show a short usage summary and exit. +``-v``, ``--verbose`` + Verbose operation. +``-I ITYPE``, ``--input-type=ITYPE`` + Set input document type. *ITYPE* is one of: ``default`` or ``wiktionary``. +``-t OTYPE``, ``--to=OTYPE``, ``--type=OTYPE`` + Set output document type (``html`` (the default), ``texi``, + ``text``, or ``dump``). +``-l LANG``, ``--lang=LANG`` + Set input document language. +``-o KW=VAL``, ``--option=KW=VAL`` + Pass the keyword argument ``KW=VAL`` to the parser class constructor. +``-d DEBUG``, ``--debug=DEBUG`` + Set debug level (0..100). +``-D``, ``--dump`` + Dump parse tree and exit; same as ``--type=dump``. +``-b URL``, ``--base-url=URL`` + Set base URL. 
+ + + diff --git a/WikiTrans/wiki2html.py b/WikiTrans/wiki2html.py index 00f02b5..6147642 100644 --- a/WikiTrans/wiki2html.py +++ b/WikiTrans/wiki2html.py @@ -111,8 +111,8 @@ class HtmlTextNode(HtmlSeqNode): class HtmlHdrNode(WikiHdrNode): def format(self): level = self.level - if level > 4: - level = 4 + if level > 6: + level = 6 return "<h%s>%s</h%s>\n\n" % (level, self.content.format(), level) class HtmlBarNode(WikiNode): diff --git a/WikiTrans/wiki2texi.py b/WikiTrans/wiki2texi.py index 39c70c6..7297195 100644 --- a/WikiTrans/wiki2texi.py +++ b/WikiTrans/wiki2texi.py @@ -303,14 +303,14 @@ class TexiWikiMarkup (WikiMarkup): self.token_class['LINK'] = TexiLinkNode self.token_class['REF'] = TexiRefNode - if "sectioning-model" in keywords: - val = keywords["sectioning-model"] + if "sectioning_model" in keywords: + val = keywords["sectioning_model"] if val in self.sectcomm: self.sectioning_model = val else: raise ValueError("Invalid value for sectioning model: %s" % val) - if "sectioning-start" in keywords: - val = keywords["sectioning-start"] + if "sectioning_start" in keywords: + val = keywords["sectioning_start"] if val < 0 or val > 4: raise ValueError("Invalid value for sectioning start: %s" % val) else: diff --git a/WikiTrans/wiki2text.py b/WikiTrans/wiki2text.py index dc2e003..cb3a183 100644 --- a/WikiTrans/wiki2text.py +++ b/WikiTrans/wiki2text.py @@ -104,7 +104,7 @@ class TextLinkNode(WikiSeqNode): ns = self.parser.wiki_ns_name(qual) if ns: if ns == 'NS_IMAGE': - if not self.parser.showrefs: + if not self.parser.show_urls: return "" text = "[%s: %s]" % (qual, text if text else arg) tgt = self.image_base + '/' + \ @@ -121,7 +121,7 @@ class TextLinkNode(WikiSeqNode): tgt = self.parser.mktgt(tgt) else: tgt = self.parser.mktgt(arg) - if self.parser.showrefs: + if self.parser.show_urls: return "%s (see %s) " % (text, tgt) elif not text or text == '': return arg @@ -212,7 +212,7 @@ class TextWikiMarkup (WikiMarkup): # Output width width = 78 # Do not show 
references. - showrefs = False + show_urls = False # Provide a minimum markup markup = True @@ -226,10 +226,8 @@ class TextWikiMarkup (WikiMarkup): super(TextWikiMarkup,self).__init__(*args, **keywords) if 'width' in keywords: self.width = keywords['width'] - if 'refs' in keywords: - self.showrefs = keywords['refs'] - if 'markup' in keywords: - self.markup = keywords['markup'] + if 'show_urls' in keywords: + self.show_urls = keywords['show_urls'] self.token_class['SEQ'] = TextSeqNode self.token_class['TEXT'] = TextTextNode self.token_class['PRE'] = TextPreNode |