From c2f0a5a1aae646401a5b2990b774c74c1adb199f Mon Sep 17 00:00:00 2001
From: mauicv <alex.athorne@seldon.io>
Date: Wed, 26 Jul 2023 16:11:14 +0100
Subject: [PATCH] Refactor torch device types out of od and into _types (#829)

* Refactor torch device types out of od and into _types

* Update types for device param throughout detect

* Update saving to account for `torch.device` option

* Remove redundant logic in _types

* Add saving test for torch device logic

* Add pydantic validation for supported PyTorch devices
---
 alibi_detect/cd/classifier.py                 |  8 ++-
 alibi_detect/cd/context_aware.py              |  8 ++-
 alibi_detect/cd/keops/learned_kernel.py       |  8 ++-
 alibi_detect/cd/keops/mmd.py                  |  8 ++-
 alibi_detect/cd/learned_kernel.py             |  8 ++-
 alibi_detect/cd/lsdd.py                       |  8 ++-
 alibi_detect/cd/lsdd_online.py                |  9 ++-
 alibi_detect/cd/mmd.py                        |  8 ++-
 alibi_detect/cd/mmd_online.py                 |  8 ++-
 alibi_detect/cd/model_uncertainty.py          | 15 ++--
 alibi_detect/cd/pytorch/classifier.py         |  8 ++-
 alibi_detect/cd/pytorch/context_aware.py      |  8 ++-
 alibi_detect/cd/pytorch/learned_kernel.py     |  8 ++-
 alibi_detect/cd/pytorch/lsdd.py               |  8 ++-
 alibi_detect/cd/pytorch/lsdd_online.py        |  8 ++-
 alibi_detect/cd/pytorch/mmd.py                |  8 ++-
 alibi_detect/cd/pytorch/mmd_online.py         |  8 ++-
 alibi_detect/cd/pytorch/preprocess.py         |  8 ++-
 alibi_detect/cd/pytorch/spot_the_diff.py      |  8 ++-
 alibi_detect/cd/spot_the_diff.py              |  8 ++-
 alibi_detect/cd/utils.py                      |  3 +-
 alibi_detect/od/_gmm.py                       | 16 ++---
 alibi_detect/od/_knn.py                       |  8 +--
 alibi_detect/od/_lof.py                       |  8 +--
 alibi_detect/od/_mahalanobis.py               | 15 ++--
 alibi_detect/od/_pca.py                       | 13 ++--
 alibi_detect/od/_svm.py                       | 10 +--
 alibi_detect/od/pytorch/base.py               |  7 +-
 alibi_detect/od/pytorch/gmm.py                | 11 +--
 alibi_detect/od/pytorch/knn.py                |  4 +-
 alibi_detect/od/pytorch/lof.py                |  4 +-
 alibi_detect/od/pytorch/mahalanobis.py        | 10 +--
 alibi_detect/od/pytorch/pca.py                | 25 ++++---
 alibi_detect/od/pytorch/svm.py                | 26 ++++---
 alibi_detect/saving/_pytorch/__init__.py      |  4 +-
 alibi_detect/saving/_pytorch/saving.py        | 18 +++++
 .../saving/_pytorch/tests/test_saving_pt.py   | 11 +++
 alibi_detect/saving/saving.py                 |  7 +-
 alibi_detect/saving/schemas.py                | 69 +++++++++++++------
 alibi_detect/saving/tests/test_saving.py      | 34 +++++++++
 alibi_detect/tests/test_dep_management.py     |  3 +-
 alibi_detect/utils/_types.py                  | 11 ++-
 alibi_detect/utils/pytorch/misc.py            |  7 +-
 alibi_detect/utils/pytorch/prediction.py      | 17 +++--
 44 files changed, 321 insertions(+), 188 deletions(-)

diff --git a/alibi_detect/cd/classifier.py b/alibi_detect/cd/classifier.py
index 263ddd20a..2113f1307 100644
--- a/alibi_detect/cd/classifier.py
+++ b/alibi_detect/cd/classifier.py
@@ -3,6 +3,7 @@
 from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow, \
     BackendValidator, Framework
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
 
 
 from sklearn.base import ClassifierMixin
@@ -43,7 +44,7 @@ def __init__(
             epochs: int = 3,
             verbose: int = 0,
             train_kwargs: Optional[dict] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             dataset: Optional[Callable] = None,
             dataloader: Optional[Callable] = None,
             input_shape: Optional[tuple] = None,
@@ -122,8 +123,9 @@ def __init__(
             Optional additional kwargs when fitting the classifier. Only relevant for 'tensorflow' and
             'pytorch' backends.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         dataset
             Dataset object used during training. Only relevant for 'tensorflow' and 'pytorch' backends.
         dataloader
diff --git a/alibi_detect/cd/context_aware.py b/alibi_detect/cd/context_aware.py
index 039758dd2..22089799b 100644
--- a/alibi_detect/cd/context_aware.py
+++ b/alibi_detect/cd/context_aware.py
@@ -4,6 +4,7 @@
 from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow, BackendValidator, Framework
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
 
 if has_pytorch:
     from alibi_detect.cd.pytorch.context_aware import ContextMMDDriftTorch
@@ -32,7 +33,7 @@ def __init__(
             prop_c_held: float = 0.25,
             n_folds: int = 5,
             batch_size: Optional[int] = 256,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None,
             verbose: bool = False
@@ -77,8 +78,9 @@ def __init__(
         batch_size
             If not None, then compute batches of MMDs at a time (rather than all at once).
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         input_shape
             Shape of input data.
         data_type
diff --git a/alibi_detect/cd/keops/learned_kernel.py b/alibi_detect/cd/keops/learned_kernel.py
index 0c722dd45..a112113a9 100644
--- a/alibi_detect/cd/keops/learned_kernel.py
+++ b/alibi_detect/cd/keops/learned_kernel.py
@@ -11,6 +11,7 @@
 from alibi_detect.utils.pytorch import get_device, predict_batch
 from alibi_detect.utils.pytorch.data import TorchDataset
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class LearnedKernelDriftKeops(BaseLearnedKernelDrift):
@@ -38,7 +39,7 @@ def __init__(
             num_workers: int = 0,
             verbose: int = 0,
             train_kwargs: Optional[dict] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             dataset: Callable = TorchDataset,
             dataloader: Callable = DataLoader,
             input_shape: Optional[tuple] = None,
@@ -108,8 +109,9 @@ def __init__(
         train_kwargs
             Optional additional kwargs when training the kernel.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Relevant for 'pytorch' and 'keops' backends.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Relevant for 'pytorch' and 'keops' backends.
         dataset
             Dataset object used during training.
         dataloader
diff --git a/alibi_detect/cd/keops/mmd.py b/alibi_detect/cd/keops/mmd.py
index 7f53d0470..83ca30a51 100644
--- a/alibi_detect/cd/keops/mmd.py
+++ b/alibi_detect/cd/keops/mmd.py
@@ -7,6 +7,7 @@
 from alibi_detect.utils.keops.kernels import GaussianRBF
 from alibi_detect.utils.pytorch import get_device
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 logger = logging.getLogger(__name__)
 
@@ -25,7 +26,7 @@ def __init__(
             configure_kernel_from_x_ref: bool = True,
             n_permutations: int = 100,
             batch_size_permutations: int = 1000000,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
     ) -> None:
@@ -63,8 +64,9 @@ def __init__(
         batch_size_permutations
             KeOps computes the n_permutations of the MMD^2 statistics in chunks of batch_size_permutations.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         input_shape
             Shape of input data.
         data_type
diff --git a/alibi_detect/cd/learned_kernel.py b/alibi_detect/cd/learned_kernel.py
index 4fbd631ed..01bc8f6c5 100644
--- a/alibi_detect/cd/learned_kernel.py
+++ b/alibi_detect/cd/learned_kernel.py
@@ -3,6 +3,7 @@
 from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow, has_keops, BackendValidator, Framework
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
 
 if has_pytorch:
     from torch.utils.data import DataLoader
@@ -44,7 +45,7 @@ def __init__(
             num_workers: int = 0,
             verbose: int = 0,
             train_kwargs: Optional[dict] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             dataset: Optional[Callable] = None,
             dataloader: Optional[Callable] = None,
             input_shape: Optional[tuple] = None,
@@ -117,8 +118,9 @@ def __init__(
         train_kwargs
             Optional additional kwargs when training the kernel.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Relevant for 'pytorch' and 'keops' backends.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Relevant for 'pytorch' and 'keops' backends.
         dataset
             Dataset object used during training.
         dataloader
diff --git a/alibi_detect/cd/lsdd.py b/alibi_detect/cd/lsdd.py
index 5f5423d42..31731f34f 100644
--- a/alibi_detect/cd/lsdd.py
+++ b/alibi_detect/cd/lsdd.py
@@ -3,6 +3,7 @@
 from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow, BackendValidator, Framework
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
 
 if has_pytorch:
     from alibi_detect.cd.pytorch.lsdd import LSDDDriftTorch
@@ -26,7 +27,7 @@ def __init__(
             n_permutations: int = 100,
             n_kernel_centers: Optional[int] = None,
             lambda_rd_max: float = 0.2,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
     ) -> None:
@@ -68,8 +69,9 @@ def __init__(
             The maximum relative difference between two estimates of LSDD that the regularization parameter
             lambda is allowed to cause. Defaults to 0.2 as in the paper.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         input_shape
             Shape of input data.
         data_type
diff --git a/alibi_detect/cd/lsdd_online.py b/alibi_detect/cd/lsdd_online.py
index 9a68e2101..d707e12d8 100644
--- a/alibi_detect/cd/lsdd_online.py
+++ b/alibi_detect/cd/lsdd_online.py
@@ -3,6 +3,8 @@
 from typing import Any, Callable, Dict, Optional, Union
 from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow, BackendValidator, Framework
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
+
 if has_pytorch:
     from alibi_detect.cd.pytorch.lsdd_online import LSDDDriftOnlineTorch
 
@@ -23,7 +25,7 @@ def __init__(
             n_bootstraps: int = 1000,
             n_kernel_centers: Optional[int] = None,
             lambda_rd_max: float = 0.2,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             verbose: bool = True,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
@@ -68,8 +70,9 @@ def __init__(
             The maximum relative difference between two estimates of LSDD that the regularization parameter
             lambda is allowed to cause. Defaults to 0.2 as in the paper.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         verbose
             Whether or not to print progress during configuration.
         input_shape
diff --git a/alibi_detect/cd/mmd.py b/alibi_detect/cd/mmd.py
index 3bf2f42b4..779e2e20f 100644
--- a/alibi_detect/cd/mmd.py
+++ b/alibi_detect/cd/mmd.py
@@ -4,6 +4,7 @@
 from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow, has_keops, BackendValidator, Framework
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
 
 if has_pytorch:
     from alibi_detect.cd.pytorch.mmd import MMDDriftTorch
@@ -33,7 +34,7 @@ def __init__(
             configure_kernel_from_x_ref: bool = True,
             n_permutations: int = 100,
             batch_size_permutations: int = 1000000,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
     ) -> None:
@@ -74,8 +75,9 @@ def __init__(
             KeOps computes the n_permutations of the MMD^2 statistics in chunks of batch_size_permutations.
             Only relevant for 'keops' backend.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Relevant for 'pytorch' and 'keops' backends.
         input_shape
             Shape of input data.
         data_type
diff --git a/alibi_detect/cd/mmd_online.py b/alibi_detect/cd/mmd_online.py
index 30d46da9a..45b06ac34 100644
--- a/alibi_detect/cd/mmd_online.py
+++ b/alibi_detect/cd/mmd_online.py
@@ -3,6 +3,7 @@
 from typing import Any, Callable, Dict, Optional, Union
 from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow, BackendValidator, Framework
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
 
 if has_pytorch:
     from alibi_detect.cd.pytorch.mmd_online import MMDDriftOnlineTorch
@@ -23,7 +24,7 @@ def __init__(
             kernel: Optional[Callable] = None,
             sigma: Optional[np.ndarray] = None,
             n_bootstraps: int = 1000,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             verbose: bool = True,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
@@ -61,8 +62,9 @@ def __init__(
             more accurately the desired ERT will be targeted. Should ideally be at least an order of magnitude
             larger than the ERT.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         verbose
             Whether or not to print progress during configuration.
         input_shape
diff --git a/alibi_detect/cd/model_uncertainty.py b/alibi_detect/cd/model_uncertainty.py
index 53fb3e339..77224185d 100644
--- a/alibi_detect/cd/model_uncertainty.py
+++ b/alibi_detect/cd/model_uncertainty.py
@@ -8,6 +8,7 @@
 from alibi_detect.cd.utils import encompass_batching, encompass_shuffling_and_batch_filling
 from alibi_detect.utils.frameworks import BackendValidator, Framework
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
 
 logger = logging.getLogger(__name__)
 
@@ -26,7 +27,7 @@ def __init__(
             margin_width: float = 0.1,
             batch_size: int = 32,
             preprocess_batch_fn: Optional[Callable] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             tokenizer: Optional[Callable] = None,
             max_len: Optional[int] = None,
             input_shape: Optional[tuple] = None,
@@ -69,8 +70,9 @@ def __init__(
             Optional batch preprocessing function. For example to convert a list of objects to a batch which can be
             processed by the model.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         tokenizer
             Optional tokenizer for NLP models.
         max_len
@@ -179,7 +181,7 @@ def __init__(
             n_evals: int = 25,
             batch_size: int = 32,
             preprocess_batch_fn: Optional[Callable] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             tokenizer: Optional[Callable] = None,
             max_len: Optional[int] = None,
             input_shape: Optional[tuple] = None,
@@ -222,8 +224,9 @@ def __init__(
             Optional batch preprocessing function. For example to convert a list of objects to a batch which can be
             processed by the model.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         tokenizer
             Optional tokenizer for NLP models.
         max_len
diff --git a/alibi_detect/cd/pytorch/classifier.py b/alibi_detect/cd/pytorch/classifier.py
index 1144f6dfd..373e6180b 100644
--- a/alibi_detect/cd/pytorch/classifier.py
+++ b/alibi_detect/cd/pytorch/classifier.py
@@ -13,6 +13,7 @@
 from alibi_detect.utils.pytorch.prediction import predict_batch
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class ClassifierDriftTorch(BaseClassifierDrift):
@@ -40,7 +41,7 @@ def __init__(
             epochs: int = 3,
             verbose: int = 0,
             train_kwargs: Optional[dict] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             dataset: Callable = TorchDataset,
             dataloader: Callable = DataLoader,
             input_shape: Optional[tuple] = None,
@@ -108,8 +109,9 @@ def __init__(
         train_kwargs
             Optional additional kwargs when fitting the classifier.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         dataset
             Dataset object used during training.
         dataloader
diff --git a/alibi_detect/cd/pytorch/context_aware.py b/alibi_detect/cd/pytorch/context_aware.py
index 25c796f0a..d1b45b03d 100644
--- a/alibi_detect/cd/pytorch/context_aware.py
+++ b/alibi_detect/cd/pytorch/context_aware.py
@@ -8,6 +8,7 @@
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.utils.frameworks import Framework
 from alibi_detect.cd._domain_clf import _SVCDomainClf
+from alibi_detect.utils._types import TorchDeviceType
 from tqdm import tqdm
 
 logger = logging.getLogger(__name__)
@@ -32,7 +33,7 @@ def __init__(
             prop_c_held: float = 0.25,
             n_folds: int = 5,
             batch_size: Optional[int] = 256,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None,
             verbose: bool = False,
@@ -75,8 +76,9 @@ def __init__(
         batch_size
             If not None, then compute batches of MMDs at a time (rather than all at once).
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         input_shape
             Shape of input data.
         data_type
diff --git a/alibi_detect/cd/pytorch/learned_kernel.py b/alibi_detect/cd/pytorch/learned_kernel.py
index 3676300a3..a8785efe7 100644
--- a/alibi_detect/cd/pytorch/learned_kernel.py
+++ b/alibi_detect/cd/pytorch/learned_kernel.py
@@ -12,6 +12,7 @@
 from alibi_detect.utils.pytorch.data import TorchDataset
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class LearnedKernelDriftTorch(BaseLearnedKernelDrift):
@@ -39,7 +40,7 @@ def __init__(
             num_workers: int = 0,
             verbose: int = 0,
             train_kwargs: Optional[dict] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             dataset: Callable = TorchDataset,
             dataloader: Callable = DataLoader,
             input_shape: Optional[tuple] = None,
@@ -108,8 +109,9 @@ def __init__(
         train_kwargs
             Optional additional kwargs when training the kernel.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         dataset
             Dataset object used during training.
         dataloader
diff --git a/alibi_detect/cd/pytorch/lsdd.py b/alibi_detect/cd/pytorch/lsdd.py
index 53ef19182..37e4dd81a 100644
--- a/alibi_detect/cd/pytorch/lsdd.py
+++ b/alibi_detect/cd/pytorch/lsdd.py
@@ -7,6 +7,7 @@
 from alibi_detect.utils.pytorch.distance import permed_lsdds
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class LSDDDriftTorch(BaseLSDDDrift):
@@ -23,7 +24,7 @@ def __init__(
             n_permutations: int = 100,
             n_kernel_centers: Optional[int] = None,
             lambda_rd_max: float = 0.2,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
     ) -> None:
@@ -63,8 +64,9 @@ def __init__(
             The maximum relative difference between two estimates of LSDD that the regularization parameter
             lambda is allowed to cause. Defaults to 0.2 as in the paper.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         input_shape
             Shape of input data.
         data_type
diff --git a/alibi_detect/cd/pytorch/lsdd_online.py b/alibi_detect/cd/pytorch/lsdd_online.py
index 3b97e33a3..0dbed9406 100644
--- a/alibi_detect/cd/pytorch/lsdd_online.py
+++ b/alibi_detect/cd/pytorch/lsdd_online.py
@@ -6,6 +6,7 @@
 from alibi_detect.utils.pytorch import get_device
 from alibi_detect.utils.pytorch import GaussianRBF, permed_lsdds, quantile
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class LSDDDriftOnlineTorch(BaseMultiDriftOnline):
@@ -22,7 +23,7 @@ def __init__(
             n_bootstraps: int = 1000,
             n_kernel_centers: Optional[int] = None,
             lambda_rd_max: float = 0.2,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             verbose: bool = True,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
@@ -65,8 +66,9 @@ def __init__(
             The maximum relative difference between two estimates of LSDD that the regularization parameter
             lambda is allowed to cause. Defaults to 0.2 as in the paper.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         verbose
             Whether or not to print progress during configuration.
         input_shape
diff --git a/alibi_detect/cd/pytorch/mmd.py b/alibi_detect/cd/pytorch/mmd.py
index 4c3c87f8d..ffacad8e6 100644
--- a/alibi_detect/cd/pytorch/mmd.py
+++ b/alibi_detect/cd/pytorch/mmd.py
@@ -8,6 +8,7 @@
 from alibi_detect.utils.pytorch.kernels import GaussianRBF
 from alibi_detect.utils.warnings import deprecated_alias
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 logger = logging.getLogger(__name__)
 
@@ -26,7 +27,7 @@ def __init__(
             sigma: Optional[np.ndarray] = None,
             configure_kernel_from_x_ref: bool = True,
             n_permutations: int = 100,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
     ) -> None:
@@ -62,8 +63,9 @@ def __init__(
         n_permutations
             Number of permutations used in the permutation test.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         input_shape
             Shape of input data.
         data_type
diff --git a/alibi_detect/cd/pytorch/mmd_online.py b/alibi_detect/cd/pytorch/mmd_online.py
index 76ef33c06..10895a83e 100644
--- a/alibi_detect/cd/pytorch/mmd_online.py
+++ b/alibi_detect/cd/pytorch/mmd_online.py
@@ -7,6 +7,7 @@
 from alibi_detect.utils.pytorch.kernels import GaussianRBF
 from alibi_detect.utils.pytorch import zero_diag, quantile
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class MMDDriftOnlineTorch(BaseMultiDriftOnline):
@@ -22,7 +23,7 @@ def __init__(
             kernel: Callable = GaussianRBF,
             sigma: Optional[np.ndarray] = None,
             n_bootstraps: int = 1000,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             verbose: bool = True,
             input_shape: Optional[tuple] = None,
             data_type: Optional[str] = None
@@ -58,8 +59,9 @@ def __init__(
             more accurately the desired ERT will be targeted. Should ideally be at least an order of magnitude
             larger than the ERT.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         verbose
             Whether or not to print progress during configuration.
         input_shape
diff --git a/alibi_detect/cd/pytorch/preprocess.py b/alibi_detect/cd/pytorch/preprocess.py
index 2ba5448d2..1e87ac5ce 100644
--- a/alibi_detect/cd/pytorch/preprocess.py
+++ b/alibi_detect/cd/pytorch/preprocess.py
@@ -5,6 +5,7 @@
 import torch.nn as nn
 from alibi_detect.utils.pytorch.prediction import (predict_batch,
                                                    predict_batch_transformer)
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class _Encoder(nn.Module):
@@ -82,7 +83,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 def preprocess_drift(x: Union[np.ndarray, list], model: Union[nn.Module, nn.Sequential],
-                     device: Optional[torch.device] = None, preprocess_batch_fn: Callable = None,
+                     device: TorchDeviceType = None, preprocess_batch_fn: Callable = None,
                      tokenizer: Optional[Callable] = None, max_len: Optional[int] = None,
                      batch_size: int = int(1e10), dtype: Union[Type[np.generic], torch.dtype] = np.float32) \
         -> Union[np.ndarray, torch.Tensor, tuple]:
@@ -96,8 +97,9 @@ def preprocess_drift(x: Union[np.ndarray, list], model: Union[nn.Module, nn.Sequ
     model
         Model used for preprocessing.
     device
-        Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-        Can be specified by passing either torch.device('cuda') or torch.device('cpu').
+        Device type used. The default tries to use the GPU and falls back on CPU if needed.
+        Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+        ``torch.device``.
     preprocess_batch_fn
         Optional batch preprocessing function. For example to convert a list of objects to a batch which can be
         processed by the PyTorch model.
diff --git a/alibi_detect/cd/pytorch/spot_the_diff.py b/alibi_detect/cd/pytorch/spot_the_diff.py
index a9b359e9c..361800d1b 100644
--- a/alibi_detect/cd/pytorch/spot_the_diff.py
+++ b/alibi_detect/cd/pytorch/spot_the_diff.py
@@ -8,6 +8,7 @@
 from alibi_detect.utils.pytorch.data import TorchDataset
 from alibi_detect.utils.pytorch import GaussianRBF
 from alibi_detect.utils.pytorch.prediction import predict_batch
+from alibi_detect.utils._types import TorchDeviceType
 
 logger = logging.getLogger(__name__)
 
@@ -35,7 +36,7 @@ def __init__(
             epochs: int = 3,
             verbose: int = 0,
             train_kwargs: Optional[dict] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             dataset: Callable = TorchDataset,
             dataloader: Callable = DataLoader,
             input_shape: Optional[tuple] = None,
@@ -105,8 +106,9 @@ def __init__(
         train_kwargs
             Optional additional kwargs when fitting the classifier.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         dataset
             Dataset object used during training.
         dataloader
diff --git a/alibi_detect/cd/spot_the_diff.py b/alibi_detect/cd/spot_the_diff.py
index 8d0b151f2..43aaa9cdd 100644
--- a/alibi_detect/cd/spot_the_diff.py
+++ b/alibi_detect/cd/spot_the_diff.py
@@ -2,6 +2,7 @@
 from typing import Callable, Dict, Optional, Union
 from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow, BackendValidator, Framework
 from alibi_detect.base import DriftConfigMixin
+from alibi_detect.utils._types import TorchDeviceType
 
 if has_pytorch:
     from alibi_detect.cd.pytorch.spot_the_diff import SpotTheDiffDriftTorch
@@ -37,7 +38,7 @@ def __init__(
             epochs: int = 3,
             verbose: int = 0,
             train_kwargs: Optional[dict] = None,
-            device: Optional[str] = None,
+            device: TorchDeviceType = None,
             dataset: Optional[Callable] = None,
             dataloader: Optional[Callable] = None,
             input_shape: Optional[tuple] = None,
@@ -109,8 +110,9 @@ def __init__(
         train_kwargs
             Optional additional kwargs when fitting the classifier.
         device
-            Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-            Can be specified by passing either 'cuda', 'gpu' or 'cpu'. Only relevant for 'pytorch' backend.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. Only relevant for 'pytorch' backend.
         dataset
             Dataset object used during training.
         dataloader
diff --git a/alibi_detect/cd/utils.py b/alibi_detect/cd/utils.py
index aaec3b009..169d85f58 100644
--- a/alibi_detect/cd/utils.py
+++ b/alibi_detect/cd/utils.py
@@ -5,6 +5,7 @@
 import numpy as np
 from alibi_detect.utils.sampling import reservoir_sampling
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 logger = logging.getLogger(__name__)
 
@@ -51,7 +52,7 @@ def encompass_batching(
         model: Callable,
         backend: str,
         batch_size: int,
-        device: Optional[str] = None,
+        device: TorchDeviceType = None,
         preprocess_batch_fn: Optional[Callable] = None,
         tokenizer: Optional[Callable] = None,
         max_len: Optional[int] = None,
diff --git a/alibi_detect/od/_gmm.py b/alibi_detect/od/_gmm.py
index 53ee14cb2..2ceafc710 100644
--- a/alibi_detect/od/_gmm.py
+++ b/alibi_detect/od/_gmm.py
@@ -1,4 +1,4 @@
-from typing import Union, Optional, Dict, Any, TYPE_CHECKING
+from typing import Optional, Dict, Any
 
 import numpy as np
 
@@ -10,10 +10,7 @@
 from alibi_detect.utils.frameworks import BackendValidator
 from alibi_detect.version import __version__
 from alibi_detect.exceptions import _catch_error as catch_error
-
-
-if TYPE_CHECKING:
-    import torch
+from alibi_detect.utils._types import TorchDeviceType
 
 
 backends = {
@@ -27,7 +24,7 @@ def __init__(
         self,
         n_components: int = 1,
         backend: Literal['pytorch', 'sklearn'] = 'sklearn',
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ) -> None:
         """Gaussian Mixture Model (GMM) outlier detector.
 
@@ -45,9 +42,10 @@ def __init__(
         backend
             Backend used for outlier detection. Defaults to ``'sklearn'``. Options are ``'pytorch'`` and ``'sklearn'``.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'`` or ``'cpu'``. The device is only used if the ``'pytorch'`` backend is
-            used. Defaults to ``None``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``. The device is only used if the ``'pytorch'`` backend is used. Defaults
+            to ``None``.
 
         Raises
         ------
diff --git a/alibi_detect/od/_knn.py b/alibi_detect/od/_knn.py
index 7ca87b641..96da6f8d4 100644
--- a/alibi_detect/od/_knn.py
+++ b/alibi_detect/od/_knn.py
@@ -1,5 +1,4 @@
 from typing import Callable, Union, Optional, Dict, Any, List, Tuple
-from typing import TYPE_CHECKING
 from typing_extensions import Literal
 
 import numpy as np
@@ -12,10 +11,7 @@
 from alibi_detect.od.base import get_aggregator, get_normalizer, NormalizerLiterals, AggregatorLiterals
 from alibi_detect.utils.frameworks import BackendValidator
 from alibi_detect.version import __version__
-
-
-if TYPE_CHECKING:
-    import torch
+from alibi_detect.utils._types import TorchDeviceType
 
 
 backends = {
@@ -31,7 +27,7 @@ def __init__(
         normalizer: Optional[Union[TransformProtocolType, NormalizerLiterals]] = 'PValNormalizer',
         aggregator: Union[TransformProtocol, AggregatorLiterals] = 'AverageAggregator',
         backend: Literal['pytorch'] = 'pytorch',
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ) -> None:
         """
         k-Nearest Neighbors (kNN) outlier detector.
diff --git a/alibi_detect/od/_lof.py b/alibi_detect/od/_lof.py
index 671e170fe..23ac9401d 100644
--- a/alibi_detect/od/_lof.py
+++ b/alibi_detect/od/_lof.py
@@ -1,5 +1,4 @@
 from typing import Callable, Union, Optional, Dict, Any, List, Tuple
-from typing import TYPE_CHECKING
 from typing_extensions import Literal
 
 import numpy as np
@@ -12,10 +11,7 @@
 from alibi_detect.od.base import get_aggregator, get_normalizer, NormalizerLiterals, AggregatorLiterals
 from alibi_detect.utils.frameworks import BackendValidator
 from alibi_detect.version import __version__
-
-
-if TYPE_CHECKING:
-    import torch
+from alibi_detect.utils._types import TorchDeviceType
 
 
 backends = {
@@ -31,7 +27,7 @@ def __init__(
         normalizer: Optional[Union[TransformProtocolType, NormalizerLiterals]] = 'PValNormalizer',
         aggregator: Union[TransformProtocol, AggregatorLiterals] = 'AverageAggregator',
         backend: Literal['pytorch'] = 'pytorch',
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ) -> None:
         """
         Local Outlier Factor (LOF) outlier detector.
diff --git a/alibi_detect/od/_mahalanobis.py b/alibi_detect/od/_mahalanobis.py
index 717301ad7..fc86ca986 100644
--- a/alibi_detect/od/_mahalanobis.py
+++ b/alibi_detect/od/_mahalanobis.py
@@ -1,5 +1,4 @@
-from typing import Union, Optional, Dict, Any
-from typing import TYPE_CHECKING
+from typing import Dict, Any
 from alibi_detect.exceptions import _catch_error as catch_error
 from typing_extensions import Literal
 
@@ -9,10 +8,7 @@
 from alibi_detect.od.pytorch import MahalanobisTorch
 from alibi_detect.utils.frameworks import BackendValidator
 from alibi_detect.version import __version__
-
-
-if TYPE_CHECKING:
-    import torch
+from alibi_detect.utils._types import TorchDeviceType
 
 
 backends = {
@@ -25,7 +21,7 @@ def __init__(
         self,
         min_eigenvalue: float = 1e-6,
         backend: Literal['pytorch'] = 'pytorch',
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ) -> None:
         """
         The Mahalanobis outlier detection method.
@@ -50,8 +46,9 @@ def __init__(
         backend
             Backend used for outlier detection. Defaults to ``'pytorch'``. Options are ``'pytorch'``.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
 
         Raises
         ------
diff --git a/alibi_detect/od/_pca.py b/alibi_detect/od/_pca.py
index 2c86259e6..b34057d7e 100644
--- a/alibi_detect/od/_pca.py
+++ b/alibi_detect/od/_pca.py
@@ -1,5 +1,4 @@
 from typing import Union, Optional, Callable, Dict, Any
-from typing import TYPE_CHECKING
 from typing_extensions import Literal
 
 import numpy as np
@@ -10,10 +9,7 @@
 from alibi_detect.utils.frameworks import BackendValidator
 from alibi_detect.version import __version__
 from alibi_detect.exceptions import _catch_error as catch_error
-
-
-if TYPE_CHECKING:
-    import torch
+from alibi_detect.utils._types import TorchDeviceType
 
 
 backends = {
@@ -27,7 +23,7 @@ def __init__(
         n_components: int,
         kernel: Optional[Callable] = None,
         backend: Literal['pytorch'] = 'pytorch',
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ) -> None:
         """Principal Component Analysis (PCA) outlier detector.
 
@@ -54,8 +50,9 @@ def __init__(
         backend
             Backend used for outlier detection. Defaults to ``'pytorch'``. Options are ``'pytorch'``.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
 
         Raises
         ------
diff --git a/alibi_detect/od/_svm.py b/alibi_detect/od/_svm.py
index a60066ff5..845fdc272 100644
--- a/alibi_detect/od/_svm.py
+++ b/alibi_detect/od/_svm.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
 
 import numpy as np
 
@@ -9,6 +9,7 @@
 from alibi_detect.utils._types import Literal
 from alibi_detect.utils.frameworks import BackendValidator
 from alibi_detect.version import __version__
+from alibi_detect.utils._types import TorchDeviceType
 
 
 if TYPE_CHECKING:
@@ -31,7 +32,7 @@ def __init__(
         kernel: 'torch.nn.Module' = None,
         optimization: Literal['sgd', 'bgd'] = 'sgd',
         backend: Literal['pytorch'] = 'pytorch',
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ) -> None:
         """One-Class Support vector machine (OCSVM) outlier detector.
 
@@ -72,8 +73,9 @@ def __init__(
         backend
             Backend used for outlier detection. Defaults to ``'pytorch'``. Options are ``'pytorch'``.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
 
         Raises
         ------
diff --git a/alibi_detect/od/pytorch/base.py b/alibi_detect/od/pytorch/base.py
index 4c5b05c5a..40f1677dd 100644
--- a/alibi_detect/od/pytorch/base.py
+++ b/alibi_detect/od/pytorch/base.py
@@ -1,5 +1,4 @@
 from typing import List, Union, Optional, Dict
-from typing_extensions import Literal
 from dataclasses import dataclass, fields
 from abc import ABC, abstractmethod
 
@@ -9,6 +8,7 @@
 from alibi_detect.od.pytorch.ensemble import FitMixinTorch
 from alibi_detect.utils.pytorch.misc import get_device
 from alibi_detect.exceptions import ThresholdNotInferredError
+from alibi_detect.utils._types import TorchDeviceType
 
 
 @dataclass
@@ -73,10 +73,7 @@ class TorchOutlierDetector(torch.nn.Module, FitMixinTorch, ABC):
     threshold_inferred = False
     threshold = None
 
-    def __init__(
-            self,
-            device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
-            ):
+    def __init__(self, device: TorchDeviceType = None):
         self.device = get_device(device)
         super().__init__()
 
diff --git a/alibi_detect/od/pytorch/gmm.py b/alibi_detect/od/pytorch/gmm.py
index 474311b9a..81330412f 100644
--- a/alibi_detect/od/pytorch/gmm.py
+++ b/alibi_detect/od/pytorch/gmm.py
@@ -1,5 +1,4 @@
-from typing import Optional, Union, Dict, Type
-from typing_extensions import Literal
+from typing import Dict, Type
 from tqdm import tqdm
 import torch
 from torch.utils.data import DataLoader
@@ -8,6 +7,7 @@
 from alibi_detect.od.pytorch.base import TorchOutlierDetector
 from alibi_detect.models.pytorch.gmm import GMMModel
 from alibi_detect.utils.pytorch.misc import get_optimizer
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class GMMTorch(TorchOutlierDetector):
@@ -16,7 +16,7 @@ class GMMTorch(TorchOutlierDetector):
     def __init__(
         self,
         n_components: int,
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ):
         """Pytorch backend for the Gaussian Mixture Model (GMM) outlier detector.
 
@@ -25,8 +25,9 @@ def __init__(
         n_components
             Number of components in gaussian mixture model.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
 
         Raises
         ------
diff --git a/alibi_detect/od/pytorch/knn.py b/alibi_detect/od/pytorch/knn.py
index 556930b3f..a5e4b7984 100644
--- a/alibi_detect/od/pytorch/knn.py
+++ b/alibi_detect/od/pytorch/knn.py
@@ -1,10 +1,10 @@
 from typing import Optional, Union, List, Tuple
-from typing_extensions import Literal
 import numpy as np
 import torch
 
 from alibi_detect.od.pytorch.ensemble import Ensembler
 from alibi_detect.od.pytorch.base import TorchOutlierDetector
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class KNNTorch(TorchOutlierDetector):
@@ -13,7 +13,7 @@ def __init__(
             k: Union[np.ndarray, List, Tuple, int],
             kernel: Optional[torch.nn.Module] = None,
             ensembler: Optional[Ensembler] = None,
-            device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+            device: TorchDeviceType = None,
             ):
         """PyTorch backend for KNN detector.
 
diff --git a/alibi_detect/od/pytorch/lof.py b/alibi_detect/od/pytorch/lof.py
index 055af2d18..6a8ec50cd 100644
--- a/alibi_detect/od/pytorch/lof.py
+++ b/alibi_detect/od/pytorch/lof.py
@@ -1,10 +1,10 @@
 from typing import Optional, Union, List, Tuple
-from typing_extensions import Literal
 import numpy as np
 import torch
 
 from alibi_detect.od.pytorch.ensemble import Ensembler
 from alibi_detect.od.pytorch.base import TorchOutlierDetector
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class LOFTorch(TorchOutlierDetector):
@@ -13,7 +13,7 @@ def __init__(
             k: Union[np.ndarray, List, Tuple, int],
             kernel: Optional[torch.nn.Module] = None,
             ensembler: Optional[Ensembler] = None,
-            device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+            device: TorchDeviceType = None,
             ):
         """PyTorch backend for LOF detector.
 
diff --git a/alibi_detect/od/pytorch/mahalanobis.py b/alibi_detect/od/pytorch/mahalanobis.py
index a51e4a9b6..ecb5aaf35 100644
--- a/alibi_detect/od/pytorch/mahalanobis.py
+++ b/alibi_detect/od/pytorch/mahalanobis.py
@@ -1,8 +1,7 @@
-from typing import Optional, Union
-from typing_extensions import Literal
 import torch
 
 from alibi_detect.od.pytorch.base import TorchOutlierDetector
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class MahalanobisTorch(TorchOutlierDetector):
@@ -11,7 +10,7 @@ class MahalanobisTorch(TorchOutlierDetector):
     def __init__(
             self,
             min_eigenvalue: float = 1e-6,
-            device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+            device: TorchDeviceType = None,
             ):
         """PyTorch backend for Mahalanobis detector.
 
@@ -20,8 +19,9 @@ def __init__(
         min_eigenvalue
             Eigenvectors with eigenvalues below this value will be discarded.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         """
         super().__init__(device=device)
         self.min_eigenvalue = min_eigenvalue
diff --git a/alibi_detect/od/pytorch/pca.py b/alibi_detect/od/pytorch/pca.py
index 23f46a200..209030bd3 100644
--- a/alibi_detect/od/pytorch/pca.py
+++ b/alibi_detect/od/pytorch/pca.py
@@ -1,9 +1,9 @@
-from typing import Optional, Union, Callable
-from typing_extensions import Literal
+from typing import Optional, Callable
 
 import torch
 
 from alibi_detect.od.pytorch.base import TorchOutlierDetector
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class PCATorch(TorchOutlierDetector):
@@ -12,7 +12,7 @@ class PCATorch(TorchOutlierDetector):
     def __init__(
             self,
             n_components: int,
-            device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+            device: TorchDeviceType = None,
             ):
         """PyTorch backend for PCA detector.
 
@@ -22,8 +22,9 @@ def __init__(
             The number of dimensions in the principal subspace. For linear PCA should have
             ``1 <= n_components < dim(data)``. For kernel pca should have ``1 <= n_components < len(data)``.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
 
         Raises
         ------
@@ -102,7 +103,7 @@ class LinearPCATorch(PCATorch):
     def __init__(
             self,
             n_components: int,
-            device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+            device: TorchDeviceType = None,
             ):
         """Linear variant of the PyTorch backend for PCA detector.
 
@@ -111,8 +112,9 @@ def __init__(
         n_components:
             The number of dimensions in the principal subspace.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         """
         super().__init__(device=device, n_components=n_components)
 
@@ -173,7 +175,7 @@ def __init__(
             self,
             n_components: int,
             kernel: Optional[Callable],
-            device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+            device: TorchDeviceType = None,
             ):
         """Kernel variant of the PyTorch backend for PCA detector.
 
@@ -184,8 +186,9 @@ def __init__(
         kernel
             Kernel function to use for outlier detection.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         """
         super().__init__(device=device, n_components=n_components)
         self.kernel = kernel
diff --git a/alibi_detect/od/pytorch/svm.py b/alibi_detect/od/pytorch/svm.py
index 081896beb..ab9adf003 100644
--- a/alibi_detect/od/pytorch/svm.py
+++ b/alibi_detect/od/pytorch/svm.py
@@ -1,16 +1,17 @@
 import warnings
-from typing import Callable, Dict, Optional, Tuple, Union
+from typing import Callable, Dict, Optional, Tuple
 
 import numpy as np
 import torch
 from sklearn.linear_model import SGDOneClassSVM
 from sklearn.utils.extmath import safe_sparse_dot
 from tqdm import tqdm
-from typing_extensions import Literal, Self
+from typing_extensions import Self
 
 from alibi_detect.od.pytorch.base import TorchOutlierDetector
 from alibi_detect.utils.pytorch.losses import hinge_loss
 from alibi_detect.utils.pytorch.kernels import GaussianRBF
+from alibi_detect.utils._types import TorchDeviceType
 
 
 class SVMTorch(TorchOutlierDetector):
@@ -21,7 +22,7 @@ def __init__(
         nu: float,
         kernel: 'torch.nn.Module' = None,
         n_components: Optional[int] = None,
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ):
         """Pytorch backend for the Support Vector Machine (SVM) outlier detector.
 
@@ -36,8 +37,9 @@ def __init__(
         n_components
             Number of components in the Nystroem approximation, by default uses all of them.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         """
         super().__init__(device=device)
         self.n_components = n_components
@@ -82,7 +84,7 @@ def __init__(
         nu: float,
         kernel: 'torch.nn.Module' = None,
         n_components: Optional[int] = None,
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ):
         """SGD Optimization backend for the One class support vector machine (SVM) outlier detector.
 
@@ -97,8 +99,9 @@ def __init__(
         n_components
             Number of components in the Nystroem approximation, by default uses all of them.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         """
         if (isinstance(device, str) and device in ('gpu', 'cuda')) or \
                 (isinstance(device, torch.device) and device.type == 'cuda'):
@@ -207,7 +210,7 @@ def __init__(
         nu: float,
         kernel: 'torch.nn.Module' = None,
         n_components: Optional[int] = None,
-        device: Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']] = None,
+        device: TorchDeviceType = None,
     ):
         """Pytorch backend for the Support Vector Machine (SVM) outlier detector.
 
@@ -222,8 +225,9 @@ def __init__(
         n_components
             Number of components in the Nystroem approximation, by default uses all of them.
         device
-            Device type used. The default tries to use the GPU and falls back on CPU if needed. Can be specified by
-            passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+            Device type used. The default tries to use the GPU and falls back on CPU if needed.
+            Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+            ``torch.device``.
         """
 
         if (isinstance(device, str) and device == 'cpu') or \
diff --git a/alibi_detect/saving/_pytorch/__init__.py b/alibi_detect/saving/_pytorch/__init__.py
index 58cc7c1df..62c05e7ba 100644
--- a/alibi_detect/saving/_pytorch/__init__.py
+++ b/alibi_detect/saving/_pytorch/__init__.py
@@ -9,9 +9,9 @@
                'load_optimizer',
                'prep_model_and_emb'])
 
-save_model_config_pt = import_optional(
+save_model_config_pt, save_device_pt = import_optional(
     'alibi_detect.saving._pytorch.saving',
-    names=['save_model_config']
+    names=['save_model_config', 'save_device']
 )
 
 get_pt_dtype = import_optional(
diff --git a/alibi_detect/saving/_pytorch/saving.py b/alibi_detect/saving/_pytorch/saving.py
index e8771305c..e4d0778de 100644
--- a/alibi_detect/saving/_pytorch/saving.py
+++ b/alibi_detect/saving/_pytorch/saving.py
@@ -10,6 +10,7 @@
 from alibi_detect.cd.pytorch import UAE, HiddenOutput
 from alibi_detect.models.pytorch import TransformerEmbedding
 from alibi_detect.utils.frameworks import Framework
+from alibi_detect.utils._types import TorchDeviceType
 
 logger = logging.getLogger(__name__)
 
@@ -129,3 +130,20 @@ def save_embedding_config(embed: TransformerEmbedding,
     embed.model.save_pretrained(filepath)
 
     return cfg_embed
+
+
+def save_device(device: TorchDeviceType):
+    """Convert a device to its string form, dropping any device index (e.g. ``'cuda:0'`` -> ``'cuda'``).
+
+    Parameters
+    ----------
+    device
+        Torch device to be serialised. Can be specified by passing either ``'cuda'``,
+        ``'gpu'``, ``'cpu'`` or an instance of ``torch.device``.
+
+    Returns
+    -------
+    The text of ``str(device)`` before the first ``':'``, e.g. ``'cuda'``, ``'gpu'`` or ``'cpu'``.
+    """
+    device_str = str(device)
+    return device_str.split(':')[0]
diff --git a/alibi_detect/saving/_pytorch/tests/test_saving_pt.py b/alibi_detect/saving/_pytorch/tests/test_saving_pt.py
index de3d37fbe..dfffa2895 100644
--- a/alibi_detect/saving/_pytorch/tests/test_saving_pt.py
+++ b/alibi_detect/saving/_pytorch/tests/test_saving_pt.py
@@ -6,7 +6,9 @@
 from alibi_detect.cd.pytorch import HiddenOutput as HiddenOutput_pt
 from alibi_detect.saving.loading import _load_model_config, _load_optimizer_config
 from alibi_detect.saving.saving import _path2str, _save_model_config
+from alibi_detect.saving._pytorch.saving import save_device
 from alibi_detect.saving.schemas import ModelConfig
+import torch
 
 backend = param_fixture("backend", ['pytorch'])
 
@@ -51,3 +53,12 @@ def test_save_model_pt(data, model, layer, tmp_path):
         assert isinstance(model_load, type(model))
     else:
         assert isinstance(model_load, HiddenOutput_pt)
+
+
+@parametrize('device', ['cpu', 'gpu', 'cuda', 'cuda:0', torch.device('cuda'), torch.device('cuda:0')])
+def test_save_device_pt(device):
+    """
+    Unit test for save_device. Each input should serialise to one of ``'gpu'``, ``'cuda'`` or ``'cpu'``.
+    """
+    result = save_device(device)
+    assert result in {'gpu', 'cuda', 'cpu'}
diff --git a/alibi_detect/saving/saving.py b/alibi_detect/saving/saving.py
index 766228dba..51d656c95 100644
--- a/alibi_detect/saving/saving.py
+++ b/alibi_detect/saving/saving.py
@@ -17,7 +17,7 @@
     supported_models_sklearn
 from alibi_detect.base import Detector, ConfigurableDetector, StatefulDetectorOnline
 from alibi_detect.saving._tensorflow import save_detector_legacy, save_model_config_tf, save_optimizer_config_tf
-from alibi_detect.saving._pytorch import save_model_config_pt
+from alibi_detect.saving._pytorch import save_model_config_pt, save_device_pt
 from alibi_detect.saving._sklearn import save_model_config_sk
 
 if TYPE_CHECKING:
@@ -188,6 +188,11 @@ def _save_detector_config(detector: ConfigurableDetector,
     if optimizer is not None:
         cfg['optimizer'] = _save_optimizer_config(optimizer)
 
+    # Serialize device
+    device = cfg.get('device')
+    if device is not None:
+        cfg['device'] = save_device_pt(device)
+
     # Serialize dataset
     dataset = cfg.get('dataset')
     if dataset is not None:
diff --git a/alibi_detect/saving/schemas.py b/alibi_detect/saving/schemas.py
index 0f1f597c3..74573c200 100644
--- a/alibi_detect/saving/schemas.py
+++ b/alibi_detect/saving/schemas.py
@@ -79,6 +79,33 @@ def validate_optimizer(cls, optimizer: Any, values: dict) -> Any:
 #  of preprocess_drift.
 
 
+class SupportedDevice:
+    """
+    Pydantic custom type validating `device` against the chosen backend: it must be `None` for TensorFlow and
+    Sklearn, and `cpu`, `cuda`, `gpu` (optionally indexed, e.g. `cuda:0`) or a torch.device for PyTorch/KeOps.
+    """
+    @classmethod
+    def __get_validators__(cls):
+        yield cls.validate_device
+
+    @classmethod
+    def validate_device(cls, device: Any, values: dict) -> Any:
+        # `backend` is missing from `values` if its own validation failed; avoid masking that with a KeyError
+        backend = values.get('backend')
+        if backend in (Framework.TENSORFLOW, Framework.SKLEARN):
+            if device is not None:
+                raise TypeError('`device` should not be specified for TensorFlow or Sklearn backends. Leave as `None`.')
+            return device
+        if backend in (Framework.PYTORCH, Framework.KEOPS):
+            # str() handles both plain strings and torch.device; drop any index suffix such as the `:0` in `cuda:0`
+            device_str = str(device).split(':')[0]
+            if device_str not in ('cpu', 'cuda', 'gpu'):
+                raise TypeError(f'`device` should be one of `cpu`, `cuda`, `gpu` or a torch.device. Got {device}.')
+            return device
+        # Catch any other unexpected issues
+        raise TypeError('The device is not recognised as a supported type.')
+
+
 # Custom BaseModel so that we can set default config
 class CustomBaseModel(BaseModel):
     """
@@ -295,7 +322,7 @@ class PreprocessConfig(CustomBaseModel):
     Optional tokenizer for text drift. Either a string referencing a HuggingFace tokenizer model name, or a
     :class:`~alibi_detect.utils.schemas.TokenizerConfig`.
     """
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[Literal['cpu', 'cuda', 'gpu']] = None
     """
     Device type used. The default `None` tries to use the GPU and falls back on CPU if needed. Only relevant if
     `src='@cd.torch.preprocess.preprocess_drift'`
@@ -682,7 +709,7 @@ class MMDDriftConfig(DriftDetectorConfig):
     configure_kernel_from_x_ref: bool = True
     n_permutations: int = 100
     batch_size_permutations: int = 1000000
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
 
 
 class MMDDriftConfigResolved(DriftDetectorConfigResolved):
@@ -702,7 +729,7 @@ class MMDDriftConfigResolved(DriftDetectorConfigResolved):
     configure_kernel_from_x_ref: bool = True
     n_permutations: int = 100
     batch_size_permutations: int = 1000000
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
 
 
 class LSDDDriftConfig(DriftDetectorConfig):
@@ -721,7 +748,7 @@ class LSDDDriftConfig(DriftDetectorConfig):
     n_permutations: int = 100
     n_kernel_centers: Optional[int] = None
     lambda_rd_max: float = 0.2
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
 
 
 class LSDDDriftConfigResolved(DriftDetectorConfigResolved):
@@ -740,7 +767,7 @@ class LSDDDriftConfigResolved(DriftDetectorConfigResolved):
     n_permutations: int = 100
     n_kernel_centers: Optional[int] = None
     lambda_rd_max: float = 0.2
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
 
 
 class ClassifierDriftConfig(DriftDetectorConfig):
@@ -772,7 +799,7 @@ class ClassifierDriftConfig(DriftDetectorConfig):
     verbose: int = 0
     train_kwargs: Optional[dict] = None
     dataset: Optional[str] = None
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     dataloader: Optional[str] = None  # TODO: placeholder, will need to be updated for pytorch implementation
     use_calibration: bool = False
     calibration_kwargs: Optional[dict] = None
@@ -808,7 +835,7 @@ class ClassifierDriftConfigResolved(DriftDetectorConfigResolved):
     verbose: int = 0
     train_kwargs: Optional[dict] = None
     dataset: Optional[Callable] = None
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     dataloader: Optional[Callable] = None  # TODO: placeholder, will need to be updated for pytorch implementation
     use_calibration: bool = False
     calibration_kwargs: Optional[dict] = None
@@ -843,7 +870,7 @@ class SpotTheDiffDriftConfig(DriftDetectorConfig):
     n_diffs: int = 1
     initial_diffs: Optional[str] = None
     l1_reg: float = 0.01
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     dataloader: Optional[str] = None  # TODO: placeholder, will need to be updated for pytorch implementation
 
 
@@ -875,7 +902,7 @@ class SpotTheDiffDriftConfigResolved(DriftDetectorConfigResolved):
     n_diffs: int = 1
     initial_diffs: Optional[np.ndarray] = None
     l1_reg: float = 0.01
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     dataloader: Optional[Callable] = None  # TODO: placeholder, will need to be updated for pytorch implementation
 
 
@@ -909,7 +936,7 @@ class LearnedKernelDriftConfig(DriftDetectorConfig):
     verbose: int = 0
     train_kwargs: Optional[dict] = None
     dataset: Optional[str] = None
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     dataloader: Optional[str] = None  # TODO: placeholder, will need to be updated for pytorch implementation
 
 
@@ -943,7 +970,7 @@ class LearnedKernelDriftConfigResolved(DriftDetectorConfigResolved):
     verbose: int = 0
     train_kwargs: Optional[dict] = None
     dataset: Optional[Callable] = None
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     dataloader: Optional[Callable] = None  # TODO: placeholder, will need to be updated for pytorch implementation
 
 
@@ -968,7 +995,7 @@ class ContextMMDDriftConfig(DriftDetectorConfig):
     n_folds: int = 5
     batch_size: Optional[int] = 256
     verbose: bool = False
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
 
 
 class ContextMMDDriftConfigResolved(DriftDetectorConfigResolved):
@@ -991,7 +1018,7 @@ class ContextMMDDriftConfigResolved(DriftDetectorConfigResolved):
     n_folds: int = 5
     batch_size: Optional[int] = 256
     verbose: bool = False
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
 
 
 class MMDDriftOnlineConfig(DriftDetectorConfig):
@@ -1009,7 +1036,7 @@ class MMDDriftOnlineConfig(DriftDetectorConfig):
     kernel: Optional[Union[str, KernelConfig]] = None
     sigma: Optional[np.ndarray] = None
     n_bootstraps: int = 1000
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     verbose: bool = True
 
 
@@ -1028,7 +1055,7 @@ class MMDDriftOnlineConfigResolved(DriftDetectorConfigResolved):
     kernel: Optional[Callable] = None
     sigma: Optional[np.ndarray] = None
     n_bootstraps: int = 1000
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     verbose: bool = True
 
 
@@ -1048,7 +1075,7 @@ class LSDDDriftOnlineConfig(DriftDetectorConfig):
     n_bootstraps: int = 1000
     n_kernel_centers: Optional[int] = None
     lambda_rd_max: float = 0.2
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     verbose: bool = True
 
 
@@ -1068,7 +1095,7 @@ class LSDDDriftOnlineConfigResolved(DriftDetectorConfigResolved):
     n_bootstraps: int = 1000
     n_kernel_centers: Optional[int] = None
     lambda_rd_max: float = 0.2
-    device: Optional[Literal['cpu', 'cuda']] = None
+    device: Optional[SupportedDevice] = None
     verbose: bool = True
 
 
@@ -1178,7 +1205,7 @@ class ClassifierUncertaintyDriftConfig(DetectorConfig):
     margin_width: float = 0.1
     batch_size: int = 32
     preprocess_batch_fn: Optional[str] = None
-    device: Optional[str] = None
+    device: Optional[SupportedDevice] = None
     tokenizer: Optional[Union[str, TokenizerConfig]] = None
     max_len: Optional[int] = None
     input_shape: Optional[tuple] = None
@@ -1205,7 +1232,7 @@ class ClassifierUncertaintyDriftConfigResolved(DetectorConfig):
     margin_width: float = 0.1
     batch_size: int = 32
     preprocess_batch_fn: Optional[Callable] = None
-    device: Optional[str] = None
+    device: Optional[SupportedDevice] = None
     tokenizer: Optional[Union[str, Callable]] = None
     max_len: Optional[int] = None
     input_shape: Optional[tuple] = None
@@ -1231,7 +1258,7 @@ class RegressorUncertaintyDriftConfig(DetectorConfig):
     n_evals: int = 25
     batch_size: int = 32
     preprocess_batch_fn: Optional[str] = None
-    device: Optional[str] = None
+    device: Optional[SupportedDevice] = None
     tokenizer: Optional[Union[str, TokenizerConfig]] = None
     max_len: Optional[int] = None
     input_shape: Optional[tuple] = None
@@ -1257,7 +1284,7 @@ class RegressorUncertaintyDriftConfigResolved(DetectorConfig):
     n_evals: int = 25
     batch_size: int = 32
     preprocess_batch_fn: Optional[Callable] = None
-    device: Optional[str] = None
+    device: Optional[SupportedDevice] = None
     tokenizer: Optional[Callable] = None
     max_len: Optional[int] = None
     input_shape: Optional[tuple] = None
diff --git a/alibi_detect/saving/tests/test_saving.py b/alibi_detect/saving/tests/test_saving.py
index 19a7b4cc1..f33726174 100644
--- a/alibi_detect/saving/tests/test_saving.py
+++ b/alibi_detect/saving/tests/test_saving.py
@@ -1365,3 +1365,37 @@ def test_cleanup(tmp_path):
 
     # Check `filepath` is deleted
     assert not tmp_path.is_dir()
+
+
+@pytest.mark.parametrize('backend, device', [
+    ('pytorch', 'cpu'),
+    ('pytorch', 'gpu'),
+    ('pytorch', 'cuda'),
+    ('pytorch', 'cuda:0'),
+    ('pytorch', torch.device('cuda')),
+    ('pytorch', torch.device('cuda:0')),
+    ('tensorflow', None),
+])
+@parametrize_with_cases("data", cases=ContinuousData, prefix='data_')
+def test_save_detector_device(backend, device, data, tmp_path, classifier_model):  # noqa: F811
+    """
+    Test saving a Detector with different device options (pytorch devices, and `None` for tensorflow).
+
+    Save using `save_detector` and load using `load_detector`. Check that the device string written to the detector
+    config toml is correct and, for the pytorch backend, that the reinstantiated detector has a valid torch device.
+    """
+    X_ref, X_h0 = data
+    detector = ClassifierDrift(
+        X_ref,
+        backend=backend,
+        model=classifier_model,
+        device=device
+    )
+    save_detector(detector, tmp_path)
+    detector_config = toml.load(tmp_path / 'config.toml')
+    loaded_detector = load_detector(tmp_path)
+    if backend == 'tensorflow':
+        assert detector_config['device'] == 'None'
+    else:
+        assert detector_config['device'] in {'cpu', 'gpu', 'cuda'}
+        assert loaded_detector._detector.device in {torch.device('cpu'), torch.device('cuda')}
diff --git a/alibi_detect/tests/test_dep_management.py b/alibi_detect/tests/test_dep_management.py
index e83a23e26..7d037a2f5 100644
--- a/alibi_detect/tests/test_dep_management.py
+++ b/alibi_detect/tests/test_dep_management.py
@@ -239,7 +239,8 @@ def test_saving_torch_dependencies(opt_dep):
         ('load_optimizer_pt', ['torch', 'keops']),
         ('prep_model_and_emb_pt', ['torch', 'keops']),
         ('save_model_config_pt', ['torch', 'keops']),
-        ('get_pt_dtype', ['torch', 'keops'])
+        ('get_pt_dtype', ['torch', 'keops']),
+        ('save_device_pt', ['torch', 'keops'])
     ]:
         dependency_map[dependency] = relations
     from alibi_detect.saving import _pytorch as pt_saving
diff --git a/alibi_detect/utils/_types.py b/alibi_detect/utils/_types.py
index 754ead31c..2f46971c8 100644
--- a/alibi_detect/utils/_types.py
+++ b/alibi_detect/utils/_types.py
@@ -1,16 +1,13 @@
 """
 Defining types compatible with different Python versions and defining custom types.
 """
-import sys
 from sklearn.base import BaseEstimator  # import here (instead of later) since sklearn currently a core dep
 from alibi_detect.utils.frameworks import has_tensorflow, has_pytorch
-from typing import Union, Type
+from typing import Union, Type, Optional
+
 
 # Literal for typing
-if sys.version_info >= (3, 8):
-    from typing import Literal  # noqa
-else:
-    from typing_extensions import Literal  # noqa
+from typing_extensions import Literal
 from typing_extensions import TypeAlias
 
 
@@ -37,3 +34,5 @@
 # type aliases, for use with mypy (must be FwdRef's if involving opt. deps.)
 OptimizerTF: TypeAlias = Union['tf.keras.optimizers.Optimizer', 'tf.keras.optimizers.legacy.Optimizer',
                                Type['tf.keras.optimizers.Optimizer'], Type['tf.keras.optimizers.legacy.Optimizer']]
+
+TorchDeviceType: TypeAlias = Optional[Union[Literal['cuda', 'gpu', 'cpu'], 'torch.device']]
diff --git a/alibi_detect/utils/pytorch/misc.py b/alibi_detect/utils/pytorch/misc.py
index 8f7e6e357..f278afade 100644
--- a/alibi_detect/utils/pytorch/misc.py
+++ b/alibi_detect/utils/pytorch/misc.py
@@ -1,5 +1,6 @@
 import logging
-from typing import Optional, Union, Type
+from typing import Type
+from alibi_detect.utils._types import TorchDeviceType
 
 import torch
 
@@ -66,14 +67,14 @@ def quantile(sample: torch.Tensor, p: float, type: int = 7, sorted: bool = False
     return float(quantile)
 
 
-def get_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
+def get_device(device: TorchDeviceType = None) -> torch.device:
     """
     Instantiates a PyTorch device object.
 
     Parameters
     ----------
     device
-        Either `None`, a str ('gpu' or 'cpu') indicating the device to choose, or an already instantiated device
+        Either `None`, a str ('gpu', 'cuda' or 'cpu') indicating the device to choose, or an already instantiated device
         object. If `None`, the GPU is selected if it is detected, otherwise the CPU is used as a fallback.
 
     Returns
diff --git a/alibi_detect/utils/pytorch/prediction.py b/alibi_detect/utils/pytorch/prediction.py
index b1cc79b7d..e6b332fd2 100644
--- a/alibi_detect/utils/pytorch/prediction.py
+++ b/alibi_detect/utils/pytorch/prediction.py
@@ -1,15 +1,16 @@
 from functools import partial
-from typing import Callable, Optional, Type, Union
+from typing import Callable, Type, Union
 
 import numpy as np
 import torch
 import torch.nn as nn
 from alibi_detect.utils.pytorch.misc import get_device
 from alibi_detect.utils.prediction import tokenize_transformer
+from alibi_detect.utils._types import TorchDeviceType
 
 
 def predict_batch(x: Union[list, np.ndarray, torch.Tensor], model: Union[Callable, nn.Module, nn.Sequential],
-                  device: Optional[torch.device] = None, batch_size: int = int(1e10), preprocess_fn: Callable = None,
+                  device: TorchDeviceType = None, batch_size: int = int(1e10), preprocess_fn: Callable = None,
                   dtype: Union[Type[np.generic], torch.dtype] = np.float32) -> Union[np.ndarray, torch.Tensor, tuple]:
     """
     Make batch predictions on a model.
@@ -21,8 +22,9 @@ def predict_batch(x: Union[list, np.ndarray, torch.Tensor], model: Union[Callabl
     model
         PyTorch model.
     device
-        Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-        Can be specified by passing either torch.device('cuda') or torch.device('cpu').
+        Device type used. The default tries to use the GPU and falls back on CPU if needed.
+        Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+        ``torch.device``.
     batch_size
         Batch size used during prediction.
     preprocess_fn
@@ -74,7 +76,7 @@ def predict_batch(x: Union[list, np.ndarray, torch.Tensor], model: Union[Callabl
 
 
 def predict_batch_transformer(x: Union[list, np.ndarray], model: Union[nn.Module, nn.Sequential],
-                              tokenizer: Callable, max_len: int, device: Optional[torch.device] = None,
+                              tokenizer: Callable, max_len: int, device: TorchDeviceType = None,
                               batch_size: int = int(1e10), dtype: Union[Type[np.generic], torch.dtype] = np.float32) \
         -> Union[np.ndarray, torch.Tensor, tuple]:
     """
@@ -91,8 +93,9 @@ def predict_batch_transformer(x: Union[list, np.ndarray], model: Union[nn.Module
     max_len
         Max sequence length for tokens.
     device
-        Device type used. The default None tries to use the GPU and falls back on CPU if needed.
-        Can be specified by passing either torch.device('cuda') or torch.device('cpu').
+        Device type used. The default tries to use the GPU and falls back on CPU if needed.
+        Can be specified by passing either ``'cuda'``, ``'gpu'``, ``'cpu'`` or an instance of
+        ``torch.device``.
     batch_size
         Batch size used during prediction.
     dtype