diff --git a/embeddings/src/main.rs b/embeddings/src/main.rs
index 8ec28fe4..e86412b5 100644
--- a/embeddings/src/main.rs
+++ b/embeddings/src/main.rs
@@ -106,7 +106,7 @@ fn main() -> Result<()> {
 
     // initialize the context
     let ctx_params = LlamaContextParams::default()
-        .with_n_threads_batch(std::thread::available_parallelism()?.get() as u32)
+        .with_n_threads_batch(std::thread::available_parallelism()?.get().try_into()?)
         .with_embeddings(true);
 
     let mut ctx = model
diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs
index 914ba918..4dffe27d 100644
--- a/examples/simple/src/main.rs
+++ b/examples/simple/src/main.rs
@@ -54,12 +54,12 @@ struct Args {
         long,
         help = "number of threads to use during generation (default: use all available threads)"
     )]
-    threads: Option<u32>,
+    threads: Option<i32>,
     #[arg(
         long,
         help = "number of threads to use during batch and prompt processing (default: use all available threads)"
     )]
-    threads_batch: Option<u32>,
+    threads_batch: Option<i32>,
     #[arg(
         short = 'c',
         long,
diff --git a/examples/usage/src/main.rs b/examples/usage/src/main.rs
index 3bc6f78f..437ff928 100644
--- a/examples/usage/src/main.rs
+++ b/examples/usage/src/main.rs
@@ -1,9 +1,13 @@
-/*
-git clone --recursive https://github.com/utilityai/llama-cpp-rs
-cd llama-cpp-rs/examples/usage
-wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
-cargo run qwen2-1_5b-instruct-q4_0.gguf
-*/
+//! # Usage
+//!
+//! This is just about the smallest possible way to do inference. To fetch a model from hugging face:
+//!
+//! ```bash
+//! git clone --recursive https://github.com/utilityai/llama-cpp-rs
+//! cd llama-cpp-rs/examples/usage
+//! wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
+//! cargo run --bin usage -- qwen2-1_5b-instruct-q4_0.gguf
+//! ```
 use std::io::Write;
 use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
@@ -13,6 +17,7 @@ use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::token::data_array::LlamaTokenDataArray;
 
+#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
 fn main() {
     let model_path = std::env::args().nth(1).expect("Please specify model path");
     let backend = LlamaBackend::init().unwrap();
@@ -28,14 +33,14 @@ fn main() {
         .expect("unable to create the llama_context");
     let tokens_list = model
         .str_to_token(&prompt, AddBos::Always)
-        .expect(&format!("failed to tokenize {prompt}"));
+        .unwrap_or_else(|_| panic!("failed to tokenize {prompt}"));
 
     let n_len = 64;
 
     // create a llama_batch with size 512
     // we use this object to submit token data for decoding
     let mut batch = LlamaBatch::new(512, 1);
-    let last_index: i32 = (tokens_list.len() - 1) as i32;
+    let last_index = tokens_list.len() as i32 - 1;
     for (i, token) in (0_i32..).zip(tokens_list.into_iter()) {
         // llama_decode will output logits only for the last token of the prompt
         let is_last = i == last_index;
diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs
index 19392eb9..93675f8a 100644
--- a/llama-cpp-2/src/context/params.rs
+++ b/llama-cpp-2/src/context/params.rs
@@ -262,7 +262,7 @@ impl LlamaContextParams {
     /// assert_eq!(params.n_threads(), 4);
     /// ```
     #[must_use]
-    pub fn n_threads(&self) -> u32 {
+    pub fn n_threads(&self) -> i32 {
         self.context_params.n_threads
     }
 
@@ -275,7 +275,7 @@ impl LlamaContextParams {
     /// assert_eq!(params.n_threads_batch(), 4);
     /// ```
     #[must_use]
-    pub fn n_threads_batch(&self) -> u32 {
+    pub fn n_threads_batch(&self) -> i32 {
         self.context_params.n_threads_batch
     }
 
@@ -290,7 +290,7 @@ impl LlamaContextParams {
     /// assert_eq!(params.n_threads(), 8);
     /// ```
     #[must_use]
-    pub fn with_n_threads(mut self, n_threads: u32) -> Self {
+    pub fn with_n_threads(mut self, n_threads: i32) -> Self {
         self.context_params.n_threads = n_threads;
         self
     }
@@ -306,7 +306,7 @@ impl LlamaContextParams {
     /// assert_eq!(params.n_threads_batch(), 8);
     /// ```
     #[must_use]
-    pub fn with_n_threads_batch(mut self, n_threads: u32) -> Self {
+    pub fn with_n_threads_batch(mut self, n_threads: i32) -> Self {
         self.context_params.n_threads_batch = n_threads;
         self
     }
@@ -354,9 +354,9 @@ impl LlamaContextParams {
     /// }
     ///
     /// use llama_cpp_2::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// params.with_cb_eval(Some(cb_eval_fn));
+    /// let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));
     /// ```
+    #[must_use]
     pub fn with_cb_eval(
         mut self,
         cb_eval: llama_cpp_sys_2::ggml_backend_sched_eval_callback,
@@ -373,8 +373,9 @@ impl LlamaContextParams {
     /// use llama_cpp_2::context::params::LlamaContextParams;
     /// let params = LlamaContextParams::default();
     /// let user_data = std::ptr::null_mut();
-    /// params.with_cb_eval_user_data(user_data);
+    /// let params = params.with_cb_eval_user_data(user_data);
     /// ```
+    #[must_use]
     pub fn with_cb_eval_user_data(mut self, cb_eval_user_data: *mut std::ffi::c_void) -> Self {
         self.context_params.cb_eval_user_data = cb_eval_user_data;
         self
diff --git a/llama-cpp-2/src/grammar.rs b/llama-cpp-2/src/grammar.rs
index 1f856684..667a870b 100644
--- a/llama-cpp-2/src/grammar.rs
+++ b/llama-cpp-2/src/grammar.rs
@@ -294,7 +294,7 @@ impl ParseState {
                     type_: gre_type,
                     value: c as _,
                 });
-                if rest.starts_with("-") && rest.get(1..).is_some_and(|r| !r.starts_with("]")) {
+                if rest.starts_with('-') && rest.get(1..).is_some_and(|r| !r.starts_with(']')) {
                     let (c, r) = Self::parse_char(&rest[1..])?;
                     rest = r;
                     rule.push(llama_grammar_element {
diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
index dd032fc4..38965200 100644
--- a/llama-cpp-2/src/model.rs
+++ b/llama-cpp-2/src/model.rs
@@ -333,7 +333,7 @@ impl LlamaModel {
         let len = string.as_bytes().len();
         let len = c_int::try_from(len).expect("length fits into c_int");
         let buf = string.into_raw();
-        let lstrip = lstrip.map(|it| i32::from(it.get())).unwrap_or(0);
+        let lstrip = lstrip.map_or(0, |it| i32::from(it.get()));
         let size = unsafe {
             llama_cpp_sys_2::llama_token_to_piece(
                 self.model.as_ptr(),
diff --git a/llama-cpp-2/src/token_type.rs b/llama-cpp-2/src/token_type.rs
index c36e4f2e..47eaf287 100644
--- a/llama-cpp-2/src/token_type.rs
+++ b/llama-cpp-2/src/token_type.rs
@@ -20,7 +20,7 @@ pub enum LlamaTokenAttr {
     SingleWord = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_SINGLE_WORD as _,
 }
 
-/// A set of LlamaTokenAttrs
+/// A set of `LlamaTokenAttrs`
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct LlamaTokenAttrs(pub BitFlags<LlamaTokenAttr>);
 
@@ -42,7 +42,7 @@ impl TryFrom<llama_cpp_sys_2::llama_vocab_type> for LlamaTokenAttrs {
     type Error = LlamaTokenTypeFromIntError;
     fn try_from(value: llama_cpp_sys_2::llama_vocab_type) -> Result<Self, Self::Error> {
-        Ok(Self(BitFlags::from_bits(value as u32).map_err(|e| {
+        Ok(Self(BitFlags::from_bits(value).map_err(|e| {
             LlamaTokenTypeFromIntError::UnknownValue(e.invalid_bits())
         })?))
     }
 }
diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp
index 7d787ed9..cddae488 160000
--- a/llama-cpp-sys-2/llama.cpp
+++ b/llama-cpp-sys-2/llama.cpp
@@ -1 +1 @@
-Subproject commit 7d787ed96c32be18603c158ab0276992cf0dc346
+Subproject commit cddae4884c853b1a7ab420458236d666e2e34423
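Reviewer note, not part of the patch: a minimal sketch of how downstream code might adopt the new `i32` thread-count setters together with the checked `try_into()` conversion the embeddings example now uses instead of an `as` cast. The `params_with_all_threads` helper and its boxed error type are illustrative assumptions, not crate API.

```rust
use llama_cpp_2::context::params::LlamaContextParams;

// Hypothetical helper: configure both thread counts from the number of
// available cores. available_parallelism() yields a usize, so try_into()
// converts it to the i32 the setters now take, surfacing an overflow as an
// error instead of silently wrapping the way an `as` cast would.
fn params_with_all_threads() -> Result<LlamaContextParams, Box<dyn std::error::Error>> {
    let threads: i32 = std::thread::available_parallelism()?.get().try_into()?;
    Ok(LlamaContextParams::default()
        .with_n_threads(threads)
        .with_n_threads_batch(threads))
}
```

The same chaining style applies to `with_cb_eval` and `with_cb_eval_user_data`, which this patch marks `#[must_use]`: discarding the returned builder, as the old doc examples did, now produces a warning instead of silently dropping the configuration.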
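Also not part of the patch: a short illustration of why the `last_index` computation in `examples/usage` puts the cast before the subtraction. The free function here is hypothetical.

```rust
// `(tokens.len() - 1) as i32` subtracts in usize first, so an empty slice
// underflows and panics in debug builds. Casting the length to i32 before
// subtracting yields -1 for the empty case and the same value otherwise.
#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
fn last_index<T>(tokens: &[T]) -> i32 {
    tokens.len() as i32 - 1
}
```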