coreylowman · DonIsaac · Jan 1, 2024 · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024
diff --git a/dfdx-core/Cargo.toml b/dfdx-core/Cargo.toml
@@ -25,22 +25,24 @@ keywords = [
 features = ["nightly", "numpy", "safetensors", "cuda", "ci-check"]
 
 [dependencies]
+bytemuck = { version = "1.7.0", optional = true }
+cudarc = { version = "0.9.15", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] }
+futures-lite = { version = "2.0.1", optional = true }
+gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
+half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
+libm = { workspace = true }
+memmap2 = { workspace = true, optional = true }
 no-std-compat = { version = "0.4.1", default-features = false, features = [ "alloc", "compat_hash" ], optional = true }
-spin = { version = "0.9.8", default-features = false, features = ["spin_mutex", "rwlock", "portable_atomic"], optional = true }
+num-traits = { workspace = true }
 rand = { workspace = true }
 rand_distr = { workspace = true }
-zip = { version = "0.6.6", default-features = false, optional = true }
-cudarc = { version = "0.9.15", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] }
-num-traits = { workspace = true }
-safetensors = { workspace = true, optional = true }
-memmap2 = { workspace = true, optional = true }
-half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
-gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
 rayon = { version = "1.7.0", optional = true }
-libm = { workspace = true }
-wgpu = { version = "0.18.0", optional = true }
-futures-lite = { version = "2.0.1", optional = true }
+safetensors = { workspace = true, optional = true }
+spin = { version = "0.9.8", default-features = false, features = ["spin_mutex", "rwlock", "portable_atomic"], optional = true }
+static_assertions = { version = "1.1.0", optional = true }
 thingbuf = { version = "0.1.4", optional = true }
+wgpu = { version = "0.18.0", optional = true }
+zip = { version = "0.6.6", default-features = false, optional = true }
 
 [dev-dependencies]
 tempfile = "3.3.0"
@@ -62,7 +64,7 @@ fast-alloc = ["std"]
 
 cuda = ["dep:cudarc", "dep:glob"]
 cudnn = ["cuda", "cudarc?/cudnn"]
-webgpu = ["dep:wgpu", "dep:futures-lite", "dep:thingbuf", "wgpu/expose-ids"]
+webgpu = ["dep:wgpu", "dep:futures-lite", "dep:thingbuf", "dep:bytemuck", "dep:static_assertions", "wgpu/expose-ids"]
 
 f16 = ["dep:half", "cudarc?/f16", "gemm?/f16"]
 

diff --git a/dfdx-core/src/tensor/cache.rs b/dfdx-core/src/tensor/cache.rs
@@ -74,7 +74,7 @@ impl<Ptr> TensorCache<Ptr> {
         }
     }
 
-    /// Disables the cache.
+    /// Enables the cache.
     pub(crate) fn enable(&self) {
         #[cfg(not(feature = "no-std"))]
         {

diff --git a/dfdx-core/src/tensor/mod.rs b/dfdx-core/src/tensor/mod.rs
@@ -146,7 +146,7 @@ mod masks;
 #[cfg(feature = "numpy")]
 pub(crate) mod numpy;
 #[cfg(feature = "webgpu")]
-pub(crate) mod webgpu;
+pub mod webgpu;
 #[cfg(feature = "numpy")]
 pub use numpy::NumpyDtype;
 mod error;
@@ -177,7 +177,7 @@ pub type AutoDevice = Cuda;
 #[cfg(feature = "webgpu")]
 pub use webgpu::Webgpu;
 #[cfg(feature = "webgpu")]
-pub type AutoDevice = Webgpu;
+pub type AutoDevice = Cpu; // TODO: switch back to `Webgpu`
 
 pub use storage_traits::{AsArray, CopySlice, TensorFrom, TensorFromVec, TensorToArray};
 pub use storage_traits::{Cache, RandomU64, Storage, Synchronize};

diff --git a/dfdx-core/src/tensor/webgpu/allocate.rs b/dfdx-core/src/tensor/webgpu/allocate.rs
@@ -112,7 +112,7 @@ where
 
 impl<E: Unit> OneFillStorage<E> for Webgpu {
     fn try_fill_with_ones(&self, storage: &mut Self::Vec) -> Result<(), Error> {
-        let len = storage.size() as usize / std::mem::size_of::<E>();
+        let len = storage.size() / std::mem::size_of::<E>();
         let buf = vec![E::ONE; len];
         storage
             .data
@@ -171,7 +171,7 @@ where
 impl<E: Unit> CopySlice<E> for Webgpu {
     fn copy_from<S: Shape, T>(dst: &mut Tensor<S, E, Self, T>, src: &[E]) {
         assert_eq!(
-            dst.data.size() as usize,
+            dst.data.size(),
             src.len() * std::mem::size_of::<E>(),
             "Slices must have same number of elements as *physical* Storage<E> of tensors."
         );
@@ -182,7 +182,7 @@ impl<E: Unit> CopySlice<E> for Webgpu {
 
     fn copy_into<S: Shape, T>(src: &Tensor<S, E, Self, T>, dst: &mut [E]) {
         assert_eq!(
-            src.data.size() as usize,
+            src.data.size(),
             dst.len() * std::mem::size_of::<E>(),
             "Slices must have same number of elements as *physical* Storage<E> of tensors."
         );

diff --git a/dfdx-core/src/tensor/webgpu/device.rs b/dfdx-core/src/tensor/webgpu/device.rs
@@ -3,6 +3,7 @@ use wgpu::{
     RequestDeviceError,
 };
 
+use super::resources::{binary_op_layout_desc, unary_op_layout_desc};
 use crate::{
     shapes::{Shape, Unit},
     tensor::{
@@ -19,6 +20,8 @@ use std::sync::Mutex;
 
 use std::{marker::PhantomData, sync::Arc, vec::Vec};
 
+use futures_lite::future::block_on;
+
 use super::allocate::round_to_buffer_alignment;
 
 #[derive(Debug)]
@@ -102,6 +105,12 @@ pub struct Webgpu {
     pub(crate) queue: Arc<Queue>,
 
     pub(crate) cache: Arc<TensorCache<Buffer>>,
+
+    // pipeline resources
+    /// `[unary, binary]` bind group layouts
+    ///
+    /// storing them for re-use reduces resource allocation pressure on the GPU
+    pub(super) layouts: [Arc<wgpu::BindGroupLayout>; 2],
 }
 
 impl From<RequestDeviceError> for Error {
@@ -129,16 +138,42 @@ impl Webgpu {
         #[cfg(not(feature = "no-std"))]
         let _lock = { CONSTRUCTOR_MUTEX.lock().unwrap() };
 
-        let cpu = Cpu::seed_from_u64(seed);
+        #[cfg(not(feature = "f16"))]
+        let features: wgpu::Features = Default::default() | wgpu::Features::PUSH_CONSTANTS;
+        #[cfg(feature = "f16")]
+        let features: wgpu::Features =
+            wgpu::Features::default() | wgpu::Features::PUSH_CONSTANTS | wgpu::Features::SHADER_F16;
+
+        let limits: wgpu::Limits = Default::default();
+        let device_desc = wgpu::DeviceDescriptor {
+            label: Some("dfdx"),
+            features,
+            limits,
+        };
+        let adapter_desc = wgpu::RequestAdapterOptions {
+            power_preference: wgpu::PowerPreference::HighPerformance,
+            ..Default::default()
+        };
+
+        // request adapter
         let instance = Arc::new(Instance::new(InstanceDescriptor::default()));
-        let adapter = futures_lite::future::block_on(instance.request_adapter(&Default::default()))
+        // note: unsupported features/limits are only reported by `request_device` below
+        let adapter = block_on(instance.request_adapter(&adapter_desc))
             .ok_or(Error::WebgpuAdapterNotFound)?;
         let adapter = Arc::new(adapter);
-        let (dev, queue) =
-            futures_lite::future::block_on(adapter.request_device(&Default::default(), None))?;
+
+        // request device from adapter
+        let (dev, queue) = block_on(adapter.request_device(&device_desc, None))?;
         let dev = Arc::new(dev);
         let queue = Arc::new(queue);
 
+        let cpu = Cpu::seed_from_u64(seed);
+
+        let layouts = [
+            Arc::new(dev.create_bind_group_layout(&unary_op_layout_desc())),
+            Arc::new(dev.create_bind_group_layout(&binary_op_layout_desc())),
+        ];
+
         Ok(Self {
             cpu,
             instance,
@@ -147,18 +182,68 @@ impl Webgpu {
             queue,
 
             cache: Default::default(),
+
+            layouts,
         })
     }
+
+    /// Records commands via `command_builder` and submits them to the GPU queue.
+    ///
+    /// This does not wait for the work to finish. To block, pass the returned
+    /// [`wgpu::SubmissionIndex`] to
+    /// `self.dev.poll(Maintain::WaitForSubmissionIndex(idx))`.
+    pub(crate) fn submit_commands<F>(&self, command_builder: F) -> wgpu::SubmissionIndex
+    where
+        F: FnOnce(&mut wgpu::CommandEncoder),
+    {
+        let descriptor = wgpu::CommandEncoderDescriptor {
+            label: Some("submit_commands"),
+        };
+        let mut encoder = self.dev.create_command_encoder(&descriptor);
+        command_builder(&mut encoder);
+
+        // The single finished command buffer is handed to the queue as-is.
+        self.queue.submit([encoder.finish()])
+    }
+
+    /// Convenience function for submitting single-stage compute operations:
+    /// binds `params` at group 0 and dispatches `work_groups` (x, y, z) workgroups.
+    /// Does not block; see [`Self::submit_commands`].
+    pub(crate) fn submit_basic_op(
+        &self,
+        pipeline: &wgpu::ComputePipeline,
+        params: &wgpu::BindGroup,
+        label: Option<&str>,
+        work_groups: &(u32, u32, u32),
+    ) -> wgpu::SubmissionIndex {
+        self.submit_commands(|encoder| {
+            let (x, y, z) = *work_groups;
+            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label,
+                ..Default::default()
+            });
+            if let Some(label) = label {
+                pass.push_debug_group(label);
+            }
+            pass.set_pipeline(pipeline);
+            pass.set_bind_group(0, params, &[]);
+            pass.dispatch_workgroups(x, y, z);
+            if label.is_some() {
+                pass.pop_debug_group();
+            }
+        })
+    }
 }
 
 impl Webgpu {
+    // todo: support configuration of usage flags
     pub(crate) unsafe fn alloc_empty<E>(&self, len: usize) -> Result<Buffer, Error> {
         let data = self.cache.try_pop::<E>(len).map_or_else(
             || Buffer {
                 data: self.dev.create_buffer(&BufferDescriptor {
                     label: None,
                     size: round_to_buffer_alignment((len * std::mem::size_of::<E>()) as u64),
-                    usage: BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
+                    usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
                     mapped_at_creation: false,
                 }),
                 size: len * std::mem::size_of::<E>(),
@@ -198,7 +283,7 @@ pub struct CachableBuffer<E> {
 
 impl<E> Clone for CachableBuffer<E> {
     fn clone(&self) -> Self {
-        let len = self.data.size() as usize / std::mem::size_of::<E>();
+        let len = self.data.size() / std::mem::size_of::<E>();
         let (encoder, data) = self.cache.try_pop::<E>(len).map_or_else(
             || {
                 let mut encoder = self.dev.create_command_encoder(&Default::default());
@@ -213,7 +298,7 @@ impl<E> Clone for CachableBuffer<E> {
                     encoder,
                     Buffer {
                         data: bfr,
-                        size: self.data.size as usize,
+                        size: self.data.size,
                     },
                 )
             },

diff --git a/dfdx-core/src/tensor/webgpu/mod.rs b/dfdx-core/src/tensor/webgpu/mod.rs
@@ -1,8 +1,11 @@
 mod allocate;
 mod device;
+mod resources;
+mod types;
 
 pub use device::Buffer;
 pub use device::Webgpu;
+pub(crate) use types::WebgpuNativeType;
 
 #[cfg(test)]
 mod tests {