diff --git a/crates/llama_cpp/src/model/mod.rs b/crates/llama_cpp/src/model/mod.rs
index 3bfed6f..e1375c8 100644
--- a/crates/llama_cpp/src/model/mod.rs
+++ b/crates/llama_cpp/src/model/mod.rs
@@ -67,7 +67,7 @@ pub enum LlamaTokenizationError {
 ///
 /// This is a thin wrapper over an `Arc<*mut llama_model>`, which is used to share the
 /// model across threads.
-#[derive(Clone, Deref, DerefMut)]
+#[derive(Deref, DerefMut)]
 struct LlamaModelInner {
     #[deref]
     #[deref_mut]
@@ -77,8 +77,6 @@ struct LlamaModelInner {
 
 unsafe impl Send for LlamaModelInner {}
 
-unsafe impl Sync for LlamaModelInner {}
-
 impl Drop for LlamaModelInner {
     fn drop(&mut self) {
         unsafe {
@@ -100,7 +98,7 @@ impl Drop for LlamaModelInner {
 #[derive(Clone)]
 pub struct LlamaModel {
     /// A handle to the inner model on the other side of the C FFI boundary.
-    model: Arc<LlamaModelInner>,
+    model: Arc<Mutex<LlamaModelInner>>,
 
     /// The size of this model's vocabulary, in tokens.
     vocabulary_size: usize,
@@ -230,10 +228,10 @@ impl LlamaModel {
                 .unwrap_or(0);
 
             Ok(Self {
-                model: Arc::new(LlamaModelInner {
+                model: Arc::new(Mutex::new(LlamaModelInner {
                     model,
                     _backend_ref: backend_ref,
-                }),
+                })),
                 vocabulary_size: vocabulary_size as usize,
                 bos_token: Token(unsafe { llama_token_bos(model) }),
                 eos_token: Token(unsafe { llama_token_eos(model) }),
@@ -293,6 +291,8 @@ impl LlamaModel {
         let mut out_buf = Vec::with_capacity(content.len() + 2);
 
         let n_written_tokens = unsafe {
+            let model_lock = self.model.lock().unwrap();
+
             // SAFETY: The pointer ranges specified here are always valid, and `n_written_tokens`
             // is always less than `content.len()`.
             //
@@ -300,7 +300,7 @@ impl LlamaModel {
             //
             // `out_buf` is a `Vec<Token>`, and `Token` is `#[repr(transparent)]` over an `i32`.
             llama_tokenize(
-                **self.model,
+                **model_lock,
                 content.as_ptr() as *const i8,
                 content.len() as i32,
                 out_buf.as_mut_ptr() as *mut i32,
@@ -356,7 +356,11 @@ impl LlamaModel {
             token.0
         );
 
-        unsafe { CStr::from_ptr(llama_token_get_text(**self.model, token.0)) }.to_bytes()
+        unsafe {
+            let model_lock = self.model.lock().unwrap();
+            CStr::from_ptr(llama_token_get_text(**model_lock, token.0))
+        }
+        .to_bytes()
     }
 
     /// Converts the provided token into a `Vec<u8>` piece, using the model's vocabulary.
@@ -365,11 +369,12 @@ impl LlamaModel {
     pub fn token_to_byte_piece(&self, token: Token) -> Vec<u8> {
         let initial_size = 8u16;
         let mut buffer = vec![0u8; usize::from(initial_size)];
+        let model_lock = self.model.lock().unwrap();
         let size = unsafe {
             // SAFETY: Casting `*mut u8` to `*mut i8` is safe because `u8` and
             // `i8` have the same size and alignment.
             llama_token_to_piece(
-                **self.model,
+                **model_lock,
                 token.0,
                 buffer.as_mut_ptr() as *mut i8,
                 std::os::raw::c_int::from(initial_size),
@@ -383,7 +388,7 @@ impl LlamaModel {
             // and `i8` have the same size and alignment. The length of
             // buffer is accurate for this reason.
             llama_token_to_piece(
-                **self.model,
+                **model_lock,
                 token.0,
                 buffer.as_mut_ptr() as *mut i8,
                 std::os::raw::c_int::from(buffer.len() as i32),
@@ -421,11 +426,13 @@ impl LlamaModel {
             let token_buf = &mut buf[i..];
 
             let size = unsafe {
+                let model_lock = self.model.lock().unwrap();
+
                // SAFETY: Casting `*mut u8` to `*mut i8` is safe because `u8` and
                // `i8` have the same size and alignment. The length of token_buf is
                // accurate for this reason.
                llama_cpp_sys::llama_token_to_piece(
-                    **self.model,
+                    **model_lock,
                     t.0,
                     token_buf.as_mut_ptr() as *mut i8,
                     token_buf.len() as i32,
@@ -463,9 +470,11 @@ impl LlamaModel {
         let max_batch = params.n_batch;
 
         let ctx = unsafe {
+            let model_lock = self.model.lock().unwrap();
+
             // SAFETY: due to `_model` being declared in the `LlamaContext`, `self` must live
             // for at least the lifetime of `LlamaContext`.
-            llama_new_context_with_model(**self.model, params)
+            llama_new_context_with_model(**model_lock, params)
         };
         if ctx.is_null() {
             return Err(LlamaContextError::SessionFailed);
@@ -640,9 +649,11 @@ impl LlamaModel {
         let context_params = params.as_context_params(batch_capacity);
 
         let context = unsafe {
+            let model_lock = self.model.lock().unwrap();
+
             // SAFETY: due to `_model` being declared in the `LlamaContext`, `self` must live
             // for at least the lifetime of `LlamaContext`.
-            llama_new_context_with_model(**self.model, context_params)
+            llama_new_context_with_model(**model_lock, context_params)
         };
 
         if context.is_null() {
diff --git a/crates/llama_cpp/src/session/mod.rs b/crates/llama_cpp/src/session/mod.rs
index 1dcf7ce..24175fc 100644
--- a/crates/llama_cpp/src/session/mod.rs
+++ b/crates/llama_cpp/src/session/mod.rs
@@ -1,5 +1,6 @@
 //! Functionality for the [`LlamaSession`] struct
 
+use derive_more::{Deref, DerefMut};
 use std::cmp::min;
 use std::ops::{Bound, RangeBounds};
 use std::sync::atomic::{AtomicUsize, Ordering};
@@ -30,6 +31,7 @@ pub use params::*;
 /// The inner part of a [`LlamaSession`].
 ///
 /// This is wrapped in an `Arc` for sharing across thread boundaries.
+#[derive(Deref, DerefMut)]
 pub(crate) struct LlamaContextInner {
     /// A pointer to the inner context.
     pub(crate) ptr: *mut llama_context,
@@ -37,8 +39,6 @@
 
 unsafe impl Send for LlamaContextInner {}
 
-unsafe impl Sync for LlamaContextInner {}
-
 impl Drop for LlamaContextInner {
     fn drop(&mut self) {
         // SAFETY: `drop`ping more than once is unsound [1], so `self.model` cannot have been
@@ -173,9 +173,11 @@ impl LlamaSession {
         trace!("Starting LLaMA decode for batch");
         let err = unsafe {
+            let session_guard = self.inner.ctx.lock().unwrap();
+
             // SAFETY: `llama_decode` will not fail for a valid `batch`, which we correctly
             // initialized above.
-            llama_decode(self.inner.ctx.lock().unwrap().ptr, batch.handle())
+            llama_decode(**session_guard, batch.handle())
         };
         if err != 0 {
             return Err(LlamaContextError::DecodeFailed(err));
         }
@@ -281,12 +283,12 @@ impl LlamaSession {
             if session.inner.last_batch_size.load(Ordering::SeqCst) == 0 {
                 // Remove last token
                 unsafe {
-                    llama_kv_cache_seq_rm(context.ptr, -1, token_buf.len() as i32 - 1, -1);
+                    llama_kv_cache_seq_rm(**context, -1, token_buf.len() as i32 - 1, -1);
                 }
 
                 // Decode last token
                 batch.add(*token_buf.last().unwrap(), current_pos, &[0], true);
-                let res = unsafe { llama_decode(context.ptr, batch.handle()) };
+                let res = unsafe { llama_decode(**context, batch.handle()) };
 
                 if res != 0 {
                     error!("Failed to decode context ({res})");
@@ -305,7 +307,7 @@ impl LlamaSession {
             // Get logit values from the model and store them in a `llama_token_data_array`
             let mut candidates: Vec<llama_token_data> = {
                 let i = session.inner.last_batch_size.load(Ordering::SeqCst);
-                let logits = unsafe { llama_get_logits_ith(context.ptr, (i - 1) as i32) };
+                let logits = unsafe { llama_get_logits_ith(**context, (i - 1) as i32) };
                 let logits = unsafe { std::slice::from_raw_parts(logits, vocab) };
 
                 logits
@@ -326,7 +328,7 @@ impl LlamaSession {
             };
 
             // Select the next token
-            let token = sampler.sample(context.ptr, &token_buf, candidates_p);
+            let token = sampler.sample(**context, &token_buf, candidates_p);
 
             // Send the token to the `CompletionHandle`, exiting on failure
             if let Err(e) = tx.send(token) {
@@ -342,7 +344,7 @@ impl LlamaSession {
             // Create a batch with the generated token and decode it
            batch.add(token, current_pos, &[0], true);
 
-            let res = unsafe { llama_decode(context.ptr, batch.handle()) };
+            let res = unsafe { llama_decode(**context, batch.handle()) };
 
             if res != 0 {
                 error!("Failed to decode context ({res})");
@@ -408,10 +410,12 @@ impl LlamaSession {
             Bound::Unbounded => -1,
         };
 
-        let context = self.inner.ctx.lock().unwrap();
-
         // -1 here to match all sequences
-        let success = unsafe { llama_kv_cache_seq_rm(context.ptr, -1, start_bound, end_bound) };
+        let success = unsafe {
+            let context = self.inner.ctx.lock().unwrap();
+
+            llama_kv_cache_seq_rm(**context, -1, start_bound, end_bound)
+        };
 
         if !success {
             return Err(LlamaContextError::InvalidRange);
@@ -511,7 +515,7 @@ impl LlamaSession {
         #[allow(unused_mut)]
         let mut copy = self.model().create_session(self.inner.params.clone())?;
 
-        let size = unsafe { llama_get_state_size(ctx.ptr) };
+        let size = unsafe { llama_get_state_size(**ctx) };
         let mut buf = vec![0; size];
 
         // SAFETY: `llama_copy_state_data` and `llama_set_state_data` should never write/read more than
@@ -519,10 +523,10 @@
         //
         // `copy` was created from the same model as `self` and with the same parameters.
         unsafe {
-            let copy_size = llama_copy_state_data(ctx.ptr, buf.as_mut_ptr());
+            let copy_size = llama_copy_state_data(**ctx, buf.as_mut_ptr());
             assert!(copy_size <= size);
-            let set_size =
-                llama_set_state_data(copy.inner.ctx.lock().unwrap().ptr, buf.as_mut_ptr());
+            let copy_guard = copy.inner.ctx.lock().unwrap();
+            let set_size = llama_set_state_data(**copy_guard, buf.as_mut_ptr());
             assert_eq!(copy_size, set_size);
         }
 
@@ -542,6 +546,6 @@ impl LlamaSession {
     /// Currently there is no way to check the amount of memory occupied in devices.
     pub fn memory_size(&self) -> usize {
         let ctx = self.inner.ctx.lock().unwrap();
-        unsafe { llama_get_state_size(ctx.ptr) }
+        unsafe { llama_get_state_size(**ctx) }
     }
 }
diff --git a/crates/llama_cpp_sys/build.rs b/crates/llama_cpp_sys/build.rs
index 34de349..5b6b3b8 100644
--- a/crates/llama_cpp_sys/build.rs
+++ b/crates/llama_cpp_sys/build.rs
@@ -1,5 +1,5 @@
 use std::env;
-use std::fs::File;
+use std::fs::{read_dir, File};
 use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::process::Command;
@@ -424,8 +424,8 @@ fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str {
 
     // CUDA gets linked through the cudarc crate.
 
-    cx.define("GGML_USE_CUBLAS", None);
-    cxx.define("GGML_USE_CUBLAS", None);
+    cx.define("GGML_USE_CUDA", None);
+    cxx.define("GGML_USE_CUDA", None);
 
     let mut nvcc = featless_cxx;
     nvcc.cuda(true)
@@ -453,9 +453,17 @@ fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str {
     }
 
     let lib_name = "ggml-cuda";
-
-    nvcc.file(LLAMA_PATH.join("ggml-cuda.cu"))
-        .include(LLAMA_PATH.join("ggml-cuda.h"))
+    let cuda_path = LLAMA_PATH.join("ggml-cuda");
+    let cuda_sources = read_dir(cuda_path.as_path())
+        .unwrap()
+        .map(|f| f.unwrap())
+        .filter(|entry| entry.file_name().to_string_lossy().ends_with(".cu"))
+        .map(|entry| entry.path());
+
+    nvcc.include(cuda_path.as_path())
+        .include(LLAMA_PATH.as_path())
+        .files(cuda_sources)
+        .file(LLAMA_PATH.join("ggml-cuda.cu"))
         .compile(lib_name);
 
     lib_name
@@ -579,6 +587,7 @@ fn compile_llama(mut cxx: Build, _out_path: impl AsRef<Path>) {
     println!("Compiling Llama.cpp..");
     cxx.include(LLAMA_PATH.as_path())
         .file(LLAMA_PATH.join("unicode.cpp"))
+        .file(LLAMA_PATH.join("unicode-data.cpp"))
         .file(LLAMA_PATH.join("llama.cpp"))
         .compile("llama");
 }
diff --git a/crates/llama_cpp_sys/include/build-info.h b/crates/llama_cpp_sys/include/build-info.h
index 03eb6dd..17561f1 100644
--- a/crates/llama_cpp_sys/include/build-info.h
+++ b/crates/llama_cpp_sys/include/build-info.h
@@ -13,7 +13,7 @@
 #ifndef BUILD_INFO_H
 #define BUILD_INFO_H
 
-#define BUILD_NUMBER 2465
-#define BUILD_COMMIT "d0d5de4"
+#define BUILD_NUMBER 2589
+#define BUILD_COMMIT "60cdf40"
 
 #endif // BUILD_INFO_H
diff --git a/crates/llama_cpp_sys/thirdparty/llama.cpp b/crates/llama_cpp_sys/thirdparty/llama.cpp
index d0d5de4..60cdf40 160000
--- a/crates/llama_cpp_sys/thirdparty/llama.cpp
+++ b/crates/llama_cpp_sys/thirdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit d0d5de42e5a65865b5fddb6f5c785083539b74c3
+Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640