diff --git a/embeddings/src/main.rs b/embeddings/src/main.rs
index 8ec28fe4..e86412b5 100644
--- a/embeddings/src/main.rs
+++ b/embeddings/src/main.rs
@@ -106,7 +106,7 @@ fn main() -> Result<()> {
 
     // initialize the context
     let ctx_params = LlamaContextParams::default()
-        .with_n_threads_batch(std::thread::available_parallelism()?.get() as u32)
+        .with_n_threads_batch(std::thread::available_parallelism()?.get().try_into()?)
         .with_embeddings(true);
 
     let mut ctx = model
diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs
index 914ba918..4dffe27d 100644
--- a/examples/simple/src/main.rs
+++ b/examples/simple/src/main.rs
@@ -54,12 +54,12 @@ struct Args {
         long,
         help = "number of threads to use during generation (default: use all available threads)"
     )]
-    threads: Option<u32>,
+    threads: Option<i32>,
     #[arg(
         long,
         help = "number of threads to use during batch and prompt processing (default: use all available threads)"
     )]
-    threads_batch: Option<u32>,
+    threads_batch: Option<i32>,
     #[arg(
         short = 'c',
         long,
diff --git a/examples/usage/src/main.rs b/examples/usage/src/main.rs
index 3bc6f78f..437ff928 100644
--- a/examples/usage/src/main.rs
+++ b/examples/usage/src/main.rs
@@ -1,9 +1,13 @@
-/*
-git clone --recursive https://github.com/utilityai/llama-cpp-rs
-cd llama-cpp-rs/examples/usage
-wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
-cargo run qwen2-1_5b-instruct-q4_0.gguf
-*/
+//! # Usage
+//!
+//! This is just about the smallest possible way to do inference. To fetch a model from hugging face:
+//!
+//! ```bash
+//! git clone --recursive https://github.com/utilityai/llama-cpp-rs
+//! cd llama-cpp-rs/examples/usage
+//! wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
+//! cargo run --bin usage -- qwen2-1_5b-instruct-q4_0.gguf
+//! ```
 use std::io::Write;
 use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
@@ -13,6 +17,7 @@ use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::token::data_array::LlamaTokenDataArray;
 
+#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
 fn main() {
     let model_path = std::env::args().nth(1).expect("Please specify model path");
     let backend = LlamaBackend::init().unwrap();
@@ -28,14 +33,14 @@ fn main() {
         .expect("unable to create the llama_context");
     let tokens_list = model
         .str_to_token(&prompt, AddBos::Always)
-        .expect(&format!("failed to tokenize {prompt}"));
+        .unwrap_or_else(|_| panic!("failed to tokenize {prompt}"));
 
     let n_len = 64;
 
     // create a llama_batch with size 512
     // we use this object to submit token data for decoding
     let mut batch = LlamaBatch::new(512, 1);
-    let last_index: i32 = (tokens_list.len() - 1) as i32;
+    let last_index = tokens_list.len() as i32 - 1;
     for (i, token) in (0_i32..).zip(tokens_list.into_iter()) {
         // llama_decode will output logits only for the last token of the prompt
         let is_last = i == last_index;
diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs
index 19392eb9..93675f8a 100644
--- a/llama-cpp-2/src/context/params.rs
+++ b/llama-cpp-2/src/context/params.rs
@@ -262,7 +262,7 @@ impl LlamaContextParams {
     /// assert_eq!(params.n_threads(), 4);
     /// ```
     #[must_use]
-    pub fn n_threads(&self) -> u32 {
+    pub fn n_threads(&self) -> i32 {
         self.context_params.n_threads
     }
 
@@ -275,7 +275,7 @@ impl LlamaContextParams {
     /// assert_eq!(params.n_threads_batch(), 4);
     /// ```
     #[must_use]
-    pub fn n_threads_batch(&self) -> u32 {
+    pub fn n_threads_batch(&self) -> i32 {
         self.context_params.n_threads_batch
     }
 
@@ -290,7 +290,7 @@ impl LlamaContextParams {
     /// assert_eq!(params.n_threads(), 8);
     /// ```
     #[must_use]
-    pub fn with_n_threads(mut self, n_threads: u32) -> Self {
+    pub fn with_n_threads(mut self, n_threads: i32) -> Self {
         self.context_params.n_threads = n_threads;
         self
     }
@@ -306,7 +306,7 @@ impl LlamaContextParams {
     /// assert_eq!(params.n_threads_batch(), 8);
     /// ```
     #[must_use]
-    pub fn with_n_threads_batch(mut self, n_threads: u32) -> Self {
+    pub fn with_n_threads_batch(mut self, n_threads: i32) -> Self {
         self.context_params.n_threads_batch = n_threads;
         self
     }
@@ -354,9 +354,9 @@ impl LlamaContextParams {
     /// }
     ///
     /// use llama_cpp_2::context::params::LlamaContextParams;
-    /// let params = LlamaContextParams::default();
-    /// params.with_cb_eval(Some(cb_eval_fn));
+    /// let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));
     /// ```
+    #[must_use]
     pub fn with_cb_eval(
         mut self,
         cb_eval: llama_cpp_sys_2::ggml_backend_sched_eval_callback,
@@ -373,8 +373,9 @@ impl LlamaContextParams {
     /// use llama_cpp_2::context::params::LlamaContextParams;
     /// let params = LlamaContextParams::default();
     /// let user_data = std::ptr::null_mut();
-    /// params.with_cb_eval_user_data(user_data);
+    /// let params = params.with_cb_eval_user_data(user_data);
     /// ```
+    #[must_use]
     pub fn with_cb_eval_user_data(mut self, cb_eval_user_data: *mut std::ffi::c_void) -> Self {
         self.context_params.cb_eval_user_data = cb_eval_user_data;
         self
diff --git a/llama-cpp-2/src/grammar.rs b/llama-cpp-2/src/grammar.rs
index 1f856684..667a870b 100644
--- a/llama-cpp-2/src/grammar.rs
+++ b/llama-cpp-2/src/grammar.rs
@@ -294,7 +294,7 @@ impl ParseState {
                     type_: gre_type,
                     value: c as _,
                 });
-                if rest.starts_with("-") && rest.get(1..).is_some_and(|r| !r.starts_with("]")) {
+                if rest.starts_with('-') && rest.get(1..).is_some_and(|r| !r.starts_with(']')) {
                     let (c, r) = Self::parse_char(&rest[1..])?;
                     rest = r;
                     rule.push(llama_grammar_element {
diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
index dd032fc4..38965200 100644
--- a/llama-cpp-2/src/model.rs
+++ b/llama-cpp-2/src/model.rs
@@ -333,7 +333,7 @@ impl LlamaModel {
         let len = string.as_bytes().len();
         let len = c_int::try_from(len).expect("length fits into c_int");
         let buf = string.into_raw();
-        let lstrip = lstrip.map(|it| i32::from(it.get())).unwrap_or(0);
+        let lstrip = lstrip.map_or(0, |it| i32::from(it.get()));
         let size = unsafe {
             llama_cpp_sys_2::llama_token_to_piece(
                 self.model.as_ptr(),
diff --git a/llama-cpp-2/src/token_type.rs b/llama-cpp-2/src/token_type.rs
index c36e4f2e..47eaf287 100644
--- a/llama-cpp-2/src/token_type.rs
+++ b/llama-cpp-2/src/token_type.rs
@@ -20,7 +20,7 @@ pub enum LlamaTokenAttr {
     SingleWord = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_SINGLE_WORD as _,
 }
 
-/// A set of LlamaTokenAttrs
+/// A set of `LlamaTokenAttrs`
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct LlamaTokenAttrs(pub BitFlags<LlamaTokenAttr>);
 
@@ -42,7 +42,7 @@ impl TryFrom<llama_cpp_sys_2::llama_vocab_type> for LlamaTokenAttrs {
     type Error = LlamaTokenTypeFromIntError;
     fn try_from(value: llama_cpp_sys_2::llama_vocab_type) -> Result<Self, Self::Error> {
-        Ok(Self(BitFlags::from_bits(value as u32).map_err(|e| {
+        Ok(Self(BitFlags::from_bits(value).map_err(|e| {
             LlamaTokenTypeFromIntError::UnknownValue(e.invalid_bits())
         })?))
     }
 }
diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp
index 7d787ed9..cddae488 160000
--- a/llama-cpp-sys-2/llama.cpp
+++ b/llama-cpp-sys-2/llama.cpp
@@ -1 +1 @@
-Subproject commit 7d787ed96c32be18603c158ab0276992cf0dc346
+Subproject commit cddae4884c853b1a7ab420458236d666e2e34423
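Reviewer note, not part of the patch: a minimal sketch of how downstream code might adopt the new `i32` thread-count setters together with the checked `try_into()` conversion the embeddings example now uses instead of an `as` cast. The `params_with_all_threads` helper and its boxed error type are illustrative assumptions, not crate API.

```rust
use llama_cpp_2::context::params::LlamaContextParams;

// Hypothetical helper: configure both thread counts from the number of
// available cores. available_parallelism() yields a usize, so try_into()
// converts it to the i32 the setters now take, surfacing an overflow as an
// error instead of silently wrapping the way an `as` cast would.
fn params_with_all_threads() -> Result<LlamaContextParams, Box<dyn std::error::Error>> {
    let threads: i32 = std::thread::available_parallelism()?.get().try_into()?;
    Ok(LlamaContextParams::default()
        .with_n_threads(threads)
        .with_n_threads_batch(threads))
}
```

The same chaining style applies to `with_cb_eval` and `with_cb_eval_user_data`, which this patch marks `#[must_use]`: discarding the returned builder, as the old doc examples did, now produces a warning instead of silently dropping the configuration.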
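Also not part of the patch: a short illustration of why the `last_index` computation in `examples/usage` puts the cast before the subtraction. The free function here is hypothetical.

```rust
// `(tokens.len() - 1) as i32` subtracts in usize first, so an empty slice
// underflows and panics in debug builds. Casting the length to i32 before
// subtracting yields -1 for the empty case and the same value otherwise.
#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
fn last_index<T>(tokens: &[T]) -> i32 {
    tokens.len() as i32 - 1
}
```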