Merge pull request #479 from utilityai/update-llama-cpp-2024-08-30
Updated llama-cpp (bot)
MarcusDunn authored Aug 30, 2024
2 parents 5c1468a + 1f77eb0 commit 153bd17
Showing 8 changed files with 29 additions and 23 deletions.
2 changes: 1 addition & 1 deletion embeddings/src/main.rs
@@ -106,7 +106,7 @@ fn main() -> Result<()> {

// initialize the context
let ctx_params = LlamaContextParams::default()
- .with_n_threads_batch(std::thread::available_parallelism()?.get() as u32)
+ .with_n_threads_batch(std::thread::available_parallelism()?.get().try_into()?)
.with_embeddings(true);

let mut ctx = model
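Note: the change above trades a lossy `as u32` cast for a checked `try_into()` now that the batch thread count is an `i32`. A minimal, standard-library-only sketch of the same conversion pattern (illustrative only, not the embeddings example itself, which feeds the value into `LlamaContextParams`):

```rust
use std::num::TryFromIntError;

fn main() -> Result<(), TryFromIntError> {
    // available_parallelism() yields a NonZeroUsize; an `as` cast would silently
    // wrap on overflow, while try_into() returns an error that `?` can propagate.
    let threads: i32 = std::thread::available_parallelism()
        .expect("failed to query available parallelism")
        .get()
        .try_into()?;
    println!("decoding batches with {threads} threads");
    Ok(())
}
```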
4 changes: 2 additions & 2 deletions examples/simple/src/main.rs
@@ -54,12 +54,12 @@ struct Args {
long,
help = "number of threads to use during generation (default: use all available threads)"
)]
- threads: Option<u32>,
+ threads: Option<i32>,
#[arg(
long,
help = "number of threads to use during batch and prompt processing (default: use all available threads)"
)]
- threads_batch: Option<u32>,
+ threads_batch: Option<i32>,
#[arg(
short = 'c',
long,
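Note: the `--threads` and `--threads-batch` flags now parse as `Option<i32>` to match the signed thread counts used by llama.cpp. A rough, hypothetical sketch of such flags with a parallelism fallback, assuming clap with the derive feature (illustrative only, not the actual simple example):

```rust
use clap::Parser;

/// Hypothetical stand-alone sketch of the thread flags after the u32 -> i32 change.
#[derive(Parser, Debug)]
struct Args {
    #[arg(long, help = "number of threads to use during generation")]
    threads: Option<i32>,
    #[arg(long, help = "number of threads to use during batch and prompt processing")]
    threads_batch: Option<i32>,
}

fn main() {
    let args = Args::parse();
    // Fall back to all available cores, converting the usize count into the i32
    // that the thread parameters now expect.
    let all_threads = std::thread::available_parallelism()
        .map(|n| i32::try_from(n.get()).unwrap_or(i32::MAX))
        .unwrap_or(1);
    let threads = args.threads.unwrap_or(all_threads);
    let threads_batch = args.threads_batch.unwrap_or(threads);
    println!("threads = {threads}, threads_batch = {threads_batch}");
}
```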
21 changes: 13 additions & 8 deletions examples/usage/src/main.rs
@@ -1,9 +1,13 @@
- /*
- git clone --recursive https://github.com/utilityai/llama-cpp-rs
- cd llama-cpp-rs/examples/usage
- wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
- cargo run qwen2-1_5b-instruct-q4_0.gguf
- */
+ //! # Usage
+ //!
+ //! This is just about the smallest possible way to do inference. To fetch a model from hugging face:
+ //!
+ //! ```bash
+ //! git clone --recursive https://github.com/utilityai/llama-cpp-rs
+ //! cd llama-cpp-rs/examples/usage
+ //! wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
+ //! cargo run --bin usage -- qwen2-1_5b-instruct-q4_0.gguf
+ //! ```
use std::io::Write;
use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::llama_backend::LlamaBackend;
@@ -13,6 +17,7 @@ use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::model::{AddBos, Special};
use llama_cpp_2::token::data_array::LlamaTokenDataArray;

+ #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
fn main() {
let model_path = std::env::args().nth(1).expect("Please specify model path");
let backend = LlamaBackend::init().unwrap();
@@ -28,14 +33,14 @@ fn main() {
.expect("unable to create the llama_context");
let tokens_list = model
.str_to_token(&prompt, AddBos::Always)
- .expect(&format!("failed to tokenize {prompt}"));
+ .unwrap_or_else(|_| panic!("failed to tokenize {prompt}"));
let n_len = 64;

// create a llama_batch with size 512
// we use this object to submit token data for decoding
let mut batch = LlamaBatch::new(512, 1);

- let last_index: i32 = (tokens_list.len() - 1) as i32;
+ let last_index = tokens_list.len() as i32 - 1;
for (i, token) in (0_i32..).zip(tokens_list.into_iter()) {
// llama_decode will output logits only for the last token of the prompt
let is_last = i == last_index;
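Note: `expect(&format!(..))` builds its message even when tokenization succeeds, which is what clippy's `expect_fun_call` lint flags; `unwrap_or_else(|_| panic!(..))` defers the formatting to the error path. The last-index arithmetic is also kept in `i32` rather than subtracting in `usize` first. A small std-only sketch of both idioms with made-up data:

```rust
fn main() {
    let prompt = "Hello world";
    // Stand-in for the tokenizer result; the real code gets this from str_to_token.
    let tokens: Result<Vec<i32>, &str> = Ok(vec![1, 2, 3]);

    // The panic message is only formatted if the Result is actually an Err.
    let tokens_list = tokens.unwrap_or_else(|_| panic!("failed to tokenize {prompt}"));

    // Subtracting after the cast keeps the arithmetic in i32, so an empty token
    // list would give -1 instead of underflowing a usize.
    let last_index = tokens_list.len() as i32 - 1;
    println!("last prompt token index: {last_index}");
}
```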
15 changes: 8 additions & 7 deletions llama-cpp-2/src/context/params.rs
@@ -262,7 +262,7 @@ impl LlamaContextParams {
/// assert_eq!(params.n_threads(), 4);
/// ```
#[must_use]
- pub fn n_threads(&self) -> u32 {
+ pub fn n_threads(&self) -> i32 {
self.context_params.n_threads
}

@@ -275,7 +275,7 @@ impl LlamaContextParams {
/// assert_eq!(params.n_threads_batch(), 4);
/// ```
#[must_use]
- pub fn n_threads_batch(&self) -> u32 {
+ pub fn n_threads_batch(&self) -> i32 {
self.context_params.n_threads_batch
}

@@ -290,7 +290,7 @@ impl LlamaContextParams {
/// assert_eq!(params.n_threads(), 8);
/// ```
#[must_use]
- pub fn with_n_threads(mut self, n_threads: u32) -> Self {
+ pub fn with_n_threads(mut self, n_threads: i32) -> Self {
self.context_params.n_threads = n_threads;
self
}
@@ -306,7 +306,7 @@ impl LlamaContextParams {
/// assert_eq!(params.n_threads_batch(), 8);
/// ```
#[must_use]
- pub fn with_n_threads_batch(mut self, n_threads: u32) -> Self {
+ pub fn with_n_threads_batch(mut self, n_threads: i32) -> Self {
self.context_params.n_threads_batch = n_threads;
self
}
@@ -354,9 +354,9 @@ impl LlamaContextParams {
/// }
///
/// use llama_cpp_2::context::params::LlamaContextParams;
- /// let params = LlamaContextParams::default();
- /// params.with_cb_eval(Some(cb_eval_fn));
+ /// let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));
/// ```
+ #[must_use]
pub fn with_cb_eval(
mut self,
cb_eval: llama_cpp_sys_2::ggml_backend_sched_eval_callback,
@@ -373,8 +373,9 @@ impl LlamaContextParams {
/// use llama_cpp_2::context::params::LlamaContextParams;
/// let params = LlamaContextParams::default();
/// let user_data = std::ptr::null_mut();
- /// params.with_cb_eval_user_data(user_data);
+ /// let params = params.with_cb_eval_user_data(user_data);
/// ```
+ #[must_use]
pub fn with_cb_eval_user_data(mut self, cb_eval_user_data: *mut std::ffi::c_void) -> Self {
self.context_params.cb_eval_user_data = cb_eval_user_data;
self
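Note: the doctests now bind or chain the value returned by the builders, and the builders themselves gained `#[must_use]` so a silently dropped return value warns. A short sketch mirroring the doctests, assuming `llama-cpp-2` is available as a dependency:

```rust
use llama_cpp_2::context::params::LlamaContextParams;

fn main() {
    // The with_* methods take `self` by value and return the updated params, so
    // the result has to be bound or chained; #[must_use] warns if it is dropped.
    let params = LlamaContextParams::default()
        .with_n_threads(8)
        .with_n_threads_batch(8);
    assert_eq!(params.n_threads(), 8);
    assert_eq!(params.n_threads_batch(), 8);
}
```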
2 changes: 1 addition & 1 deletion llama-cpp-2/src/grammar.rs
@@ -294,7 +294,7 @@ impl ParseState {
type_: gre_type,
value: c as _,
});
- if rest.starts_with("-") && rest.get(1..).is_some_and(|r| !r.starts_with("]")) {
+ if rest.starts_with('-') && rest.get(1..).is_some_and(|r| !r.starts_with(']')) {
let (c, r) = Self::parse_char(&rest[1..])?;
rest = r;
rule.push(llama_grammar_element {
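Note: a single-character pattern (`'-'`, `']'`) is the form clippy's `single_char_pattern` lint suggests; behaviour is unchanged. A tiny std-only illustration with a made-up character-class fragment:

```rust
fn main() {
    // Hypothetical remainder of a character-class body, e.g. from "[a-z]".
    let rest = "-z]";
    // starts_with accepts a char pattern directly; this matches exactly the same
    // inputs as the previous "-" / "]" string patterns.
    let is_range = rest.starts_with('-') && rest.get(1..).is_some_and(|r| !r.starts_with(']'));
    assert!(is_range);
}
```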
2 changes: 1 addition & 1 deletion llama-cpp-2/src/model.rs
@@ -333,7 +333,7 @@ impl LlamaModel {
let len = string.as_bytes().len();
let len = c_int::try_from(len).expect("length fits into c_int");
let buf = string.into_raw();
- let lstrip = lstrip.map(|it| i32::from(it.get())).unwrap_or(0);
+ let lstrip = lstrip.map_or(0, |it| i32::from(it.get()));
let size = unsafe {
llama_cpp_sys_2::llama_token_to_piece(
self.model.as_ptr(),
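Note: `map_or(default, f)` folds the `None` default and the `Some` mapping into one call, which is what clippy's `map_unwrap_or` lint suggests over `.map(..).unwrap_or(0)`. A std-only sketch; the `NonZeroU16` type is an assumption for illustration, since the hunk does not show `lstrip`'s declared type:

```rust
use std::num::NonZeroU16;

fn main() {
    let lstrip: Option<NonZeroU16> = NonZeroU16::new(2);
    // Equivalent to .map(|it| i32::from(it.get())).unwrap_or(0), in a single call.
    let lstrip = lstrip.map_or(0, |it| i32::from(it.get()));
    assert_eq!(lstrip, 2);
}
```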
4 changes: 2 additions & 2 deletions llama-cpp-2/src/token_type.rs
@@ -20,7 +20,7 @@ pub enum LlamaTokenAttr {
SingleWord = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_SINGLE_WORD as _,
}

- /// A set of LlamaTokenAttrs
+ /// A set of `LlamaTokenAttrs`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LlamaTokenAttrs(pub BitFlags<LlamaTokenAttr>);

@@ -42,7 +42,7 @@ impl TryFrom<llama_cpp_sys_2::llama_token_type> for LlamaTokenAttrs {
type Error = LlamaTokenTypeFromIntError;

fn try_from(value: llama_cpp_sys_2::llama_vocab_type) -> Result<Self, Self::Error> {
- Ok(Self(BitFlags::from_bits(value as u32).map_err(|e| {
+ Ok(Self(BitFlags::from_bits(value).map_err(|e| {
LlamaTokenTypeFromIntError::UnknownValue(e.invalid_bits())
})?))
}
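Note: dropping the `value as u32` cast means `BitFlags::from_bits` validates the raw value at whatever width the bindings provide, and `invalid_bits()` reports anything unrecognised. A self-contained sketch with the `enumflags2` crate (the same `BitFlags` and `from_bits`/`invalid_bits` calls the hunk uses); the `Attr` enum here is hypothetical, not the crate's actual `LlamaTokenAttr`:

```rust
use enumflags2::{bitflags, BitFlags};

// Hypothetical attribute flags, standing in for LlamaTokenAttr.
#[bitflags]
#[repr(u32)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Attr {
    Normal = 0b001,
    Control = 0b010,
    Unknown = 0b100,
}

fn main() {
    // from_bits validates the raw integer instead of silently truncating it.
    let attrs: BitFlags<Attr> = BitFlags::from_bits(0b011).expect("unknown attribute bits");
    assert!(attrs.contains(Attr::Normal) && attrs.contains(Attr::Control));

    // Unrecognised bits come back as an error that exposes exactly which bits failed.
    let err = BitFlags::<Attr>::from_bits(0b1000).unwrap_err();
    println!("invalid bits: {:#x}", err.invalid_bits());
}
```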
2 changes: 1 addition & 1 deletion llama-cpp-sys-2/llama.cpp
Submodule llama.cpp updated 58 files
+11 −14 .devops/full-cuda.Dockerfile
+13 −11 .devops/llama-cli-cuda.Dockerfile
+16 −13 .devops/llama-server-cuda.Dockerfile
+2 −0 .devops/llama-server-intel.Dockerfile
+2 −0 .devops/llama-server-rocm.Dockerfile
+2 −0 .devops/llama-server-vulkan.Dockerfile
+2 −0 .devops/llama-server.Dockerfile
+1 −1 .ecrc
+3 −12 .github/workflows/docker.yml
+334 −23 common/common.cpp
+23 −7 common/common.h
+5,990 −6,398 common/stb_image.h
+2 −2 convert_hf_to_gguf.py
+2 −2 docs/backend/SYCL.md
+2 −2 docs/docker.md
+1 −1 examples/baby-llama/baby-llama.cpp
+2 −2 examples/benchmark/benchmark-matmult.cpp
+2 −2 examples/cvector-generator/cvector-generator.cpp
+1 −1 examples/export-lora/export-lora.cpp
+119 −6 examples/llama-bench/llama-bench.cpp
+2 −2 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+2 −2 examples/llava/README-minicpmv2.5.md
+1 −5 examples/llava/clip.cpp
+2 −2 examples/llava/llava-cli.cpp
+1 −1 examples/llava/minicpmv-cli.cpp
+37 −0 examples/main/main.cpp
+43 −17 examples/server/README.md
+3 −3 examples/server/server.cpp
+4 −3 examples/speculative/speculative.cpp
+3 −3 flake.lock
+2 −2 ggml/include/ggml-alloc.h
+2 −0 ggml/include/ggml-backend.h
+126 −40 ggml/include/ggml.h
+1 −1 ggml/src/CMakeLists.txt
+20 −5 ggml/src/ggml-backend.c
+20 −1 ggml/src/ggml-cuda.cu
+8 −0 ggml/src/ggml-cuda/binbcast.cu
+1 −0 ggml/src/ggml-cuda/binbcast.cuh
+106 −0 ggml/src/ggml-cuda/cross-entropy-loss.cu
+5 −0 ggml/src/ggml-cuda/cross-entropy-loss.cuh
+1 −2 ggml/src/ggml-cuda/sumrows.cu
+2 −0 ggml/src/ggml-cuda/sumrows.cuh
+56 −0 ggml/src/ggml-cuda/unary.cu
+6 −0 ggml/src/ggml-cuda/unary.cuh
+61 −1 ggml/src/ggml-metal.m
+81 −1 ggml/src/ggml-metal.metal
+1 −1 ggml/src/ggml-quants.c
+62 −0 ggml/src/ggml-vulkan.cpp
+1,341 −271 ggml/src/ggml.c
+15 −0 ggml/src/vulkan-shaders/cos.comp
+15 −0 ggml/src/vulkan-shaders/sin.comp
+8 −0 ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+12 −5 include/llama.h
+1 −1 scripts/sync-ggml.last
+42 −18 src/llama.cpp
+77 −0 tests/test-backend-ops.cpp
+179 −66 tests/test-grad0.cpp
+1 −1 tests/test-rope.cpp
