diff --git a/pypandoc/__init__.py b/pypandoc/__init__.py index 3fd81b3..4e6359f 100644 --- a/pypandoc/__init__.py +++ b/pypandoc/__init__.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import, print_function, with_statement from typing import Iterable from typing import Union from typing import Generator @@ -13,10 +12,12 @@ import textwrap import glob from pathlib import Path +from urllib.parse import urlparse +from urllib.request import url2pathname from .handler import _check_log_handler from .pandoc_download import DEFAULT_TARGET_FOLDER, download_pandoc -from .py3compat import cast_bytes, cast_unicode, string_types, url2path, urlparse +from .py3compat import _DEFAULT_ENCODING __author__ = u'Juho Vepsäläinen' __author_email__ = "bebraw@gmail.com" @@ -53,6 +54,11 @@ # Set up the module level logger logger = logging.getLogger(__name__) +def url2path(url): # noqa: E303 + # from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url + return url2pathname(urlparse(url).path) + + def convert_text(source:str, to:str, format:str, extra_args:Iterable=(), encoding:str='utf-8', outputfile:Union[None, str, Path]=None, filters:Union[Iterable, None]=None, verify_format:bool=True, sandbox:bool=True, cworkdir:Union[str, None]=None) -> str: @@ -238,7 +244,10 @@ def _as_unicode(source:any, encoding:str) -> any: # if a source and a different encoding is given, try to decode the the source into a # unicode string try: - source = cast_unicode(source, encoding=encoding) + if isinstance(source, bytes): + encoding = encoding or _DEFAULT_ENCODING + source = source.decode(encoding) + except (UnicodeDecodeError, UnicodeEncodeError): pass return source @@ -356,7 +365,7 @@ def _convert_input(source, format, input_type, to, extra_args=(), # adds the proper filter syntax for each item in the filters list if filters is not None: - if isinstance(filters, string_types): + if isinstance(filters, str): filters = filters.split() f = ['--lua-filter=' + x if x.endswith(".lua") else '--filter=' + x for x in filters] args.extend(f) @@ -392,7 +401,8 @@ def _convert_input(source, format, input_type, to, extra_args=(), if string_input: try: - source = cast_bytes(source, encoding='utf-8') + if not isinstance(source, bytes): + source = source.encode('utf-8') except (UnicodeDecodeError, UnicodeEncodeError): # assume that it is already a utf-8 encoded string pass diff --git a/pypandoc/pandoc_download.py b/pypandoc/pandoc_download.py index dc09301..919a9b0 100644 --- a/pypandoc/pandoc_download.py +++ b/pypandoc/pandoc_download.py @@ -9,13 +9,9 @@ import subprocess import sys import tempfile -from typing import Union - import urllib -try: - from urllib.request import urlopen -except ImportError: - from urllib import urlopen +from typing import Union +from urllib.request import urlopen from .handler import _check_log_handler diff --git a/pypandoc/py3compat.py b/pypandoc/py3compat.py index afb5b74..d292509 100644 --- a/pypandoc/py3compat.py +++ b/pypandoc/py3compat.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement - import locale import sys @@ -16,62 +14,3 @@ pass _DEFAULT_ENCODING = _DEFAULT_ENCODING or sys.getdefaultencoding() - - -def _decode(s, encoding=None): - encoding = encoding or _DEFAULT_ENCODING - return s.decode(encoding) - - -def _encode(u, encoding=None): - encoding = encoding or _DEFAULT_ENCODING - return u.encode(encoding) - - -def cast_unicode(s, encoding=None): - if isinstance(s, bytes): - return _decode(s, encoding) - return s - - -def cast_bytes(s, encoding=None): - # bytes == str on py2.7 -> always encode on py2 - if not isinstance(s, bytes): - return _encode(s, encoding) - return s - - -if sys.version_info[0] >= 3: - PY3 = True - - string_types = (str,) - unicode_type = str - - # from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url - from urllib.parse import urljoin, urlparse - from urllib.request import pathname2url, url2pathname - - - def path2url(path): # noqa: E303 - return urljoin('file:', pathname2url(path)) - - - def url2path(url): # noqa: E303 - return url2pathname(urlparse(url).path) - -else: - PY3 = False - - string_types = (str, unicode) # noqa: F821 - unicode_type = unicode # noqa: F821 - - from urlparse import urljoin, urlparse - import urllib - - - def path2url(path): # noqa: E303 - return urljoin('file:', urllib.pathname2url(path)) - - - def url2path(url): # noqa: E303 - return urllib.url2pathname(urlparse(url).path) diff --git a/tests.py b/tests.py index 982fe66..5c8dbf1 100755 --- a/tests.py +++ b/tests.py @@ -14,9 +14,10 @@ import unittest import warnings from pathlib import Path +from urllib.parse import urljoin +from urllib.request import pathname2url import pypandoc -from pypandoc.py3compat import path2url, string_types, unicode_type @contextlib.contextmanager @@ -53,7 +54,7 @@ def closed_tempfile(suffix, text=None, dir_name=None): # Stolen from pandas def is_list_like(arg): return (hasattr(arg, '__iter__') and - not isinstance(arg, string_types)) + not isinstance(arg, str)) @contextlib.contextmanager @@ -155,7 +156,7 @@ def test_get_pandoc_formats(self): def test_get_pandoc_version(self): assert "HOME" in os.environ, "No HOME set, this will error..." version = pypandoc.get_pandoc_version() - self.assertTrue(isinstance(version, pypandoc.string_types)) + self.assertTrue(isinstance(version, str)) major = int(version.split(".")[0]) # according to http://pandoc.org/releases.html there were only two versions 0.x ... self.assertTrue(major in [0, 1, 2]) @@ -221,7 +222,9 @@ def test_basic_conversion_from_file_url(self): expected = u'some title{0}=========={0}{0}'.format(os.linesep) # this keeps the : (which should be '|' on windows but pandoc # doesn't like it - file_url = path2url(file_name) + + # from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url + file_url = urljoin('file:', pathname2url(file_name)) assert pypandoc._identify_path(file_url) received = pypandoc.convert_file(file_url, 'rst') @@ -490,12 +493,12 @@ def test_unicode_input(self): # make sure that pandoc always returns unicode and does not mishandle it expected = u'üäöîôû{0}'.format(os.linesep) written = pypandoc.convert_text(u'

üäöîôû

', 'md', format='html') - self.assertTrue(isinstance(written, unicode_type)) + self.assertTrue(isinstance(written, str)) self.assertEqualExceptForNewlineEnd(expected, written) bytes = u'

üäöîôû

'.encode("utf-8") written = pypandoc.convert_text(bytes, 'md', format='html') self.assertTrue(expected == written) - self.assertTrue(isinstance(written, unicode_type)) + self.assertTrue(isinstance(written, str)) # Only use german umlauts in the next test, as iso-8859-15 covers that expected = u'äüäö{0}'.format(os.linesep) @@ -516,7 +519,7 @@ def f(): # with the right encoding it should work... written = pypandoc.convert_text(bytes, 'md', format='html', encoding="iso-8859-15") self.assertEqualExceptForNewlineEnd(expected, written) - self.assertTrue(isinstance(written, unicode_type)) + self.assertTrue(isinstance(written, str)) def test_conversion_from_non_plain_text_file(self): with closed_tempfile('.docx') as file_name: