From e94e54b416910db989fad6f47616cc6d9a8edde6 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Mon, 5 Feb 2024 00:23:19 +0000 Subject: [PATCH 1/3] updated llama.cpp --- llama-cpp-sys-2/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 19122117..9392ebd4 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 191221178f51b6e81122c5bda0fd79620e547d07 +Subproject commit 9392ebd49ea5ae236a55b47cbf6a13247e8a3b8c From c63113377c75507677e06165e3008b91c00c5b22 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Mon, 5 Feb 2024 07:56:28 -0800 Subject: [PATCH 2/3] updated type for RopeScalingType + fmt --- llama-cpp-2/examples/simple.rs | 43 +++++++++++++++++-------------- llama-cpp-2/src/context/params.rs | 8 +++--- llama-cpp-2/src/llama_batch.rs | 17 ++++++++---- llama-cpp-2/src/model.rs | 4 +-- llama-cpp-2/src/token.rs | 2 +- llama-cpp-sys-2/build.rs | 17 +++++++----- 6 files changed, 53 insertions(+), 38 deletions(-) diff --git a/llama-cpp-2/examples/simple.rs b/llama-cpp-2/examples/simple.rs index 37feb722..943e4b10 100644 --- a/llama-cpp-2/examples/simple.rs +++ b/llama-cpp-2/examples/simple.rs @@ -1,21 +1,20 @@ //! This is an translation of simple.cpp in llama.cpp using llama-cpp-2. #![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] -use std::io::Write; -use std::num::NonZeroU32; -use std::path::PathBuf; -use std::time::Duration; +use anyhow::{bail, Context, Result}; use clap::Parser; use llama_cpp_2::context::params::LlamaContextParams; -use llama_cpp_2::llama_backend::LlamaBackend; -use llama_cpp_2::model::LlamaModel; -use llama_cpp_2::model::params::LlamaModelParams; -use anyhow::{bail, Context, Result}; use llama_cpp_2::ggml_time_us; +use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; -use llama_cpp_2::token::data_array::LlamaTokenDataArray; +use llama_cpp_2::model::params::LlamaModelParams; use llama_cpp_2::model::AddBos; - +use llama_cpp_2::model::LlamaModel; +use llama_cpp_2::token::data_array::LlamaTokenDataArray; +use std::io::Write; +use std::num::NonZeroU32; +use std::path::PathBuf; +use std::time::Duration; #[derive(clap::Parser)] struct Args { @@ -30,7 +29,6 @@ struct Args { disable_gpu: bool, } - fn main() -> Result<()> { let params = Args::parse(); @@ -60,12 +58,14 @@ fn main() -> Result<()> { .with_n_ctx(NonZeroU32::new(2048)) .with_seed(1234); - let mut ctx = model.new_context(&backend, ctx_params) + let mut ctx = model + .new_context(&backend, ctx_params) .with_context(|| "unable to create the llama_context")?; // tokenize the prompt - let tokens_list = model.str_to_token(¶ms.prompt, AddBos::Always) + let tokens_list = model + .str_to_token(¶ms.prompt, AddBos::Always) .with_context(|| format!("failed to tokenize {}", params.prompt))?; let n_cxt = ctx.n_ctx() as i32; @@ -75,8 +75,10 @@ fn main() -> Result<()> { // make sure the KV cache is big enough to hold all the prompt and generated tokens if n_kv_req > n_cxt { - bail!("n_kv_req > n_ctx, the required kv cache size is not big enough -either reduce n_len or increase n_ctx") + bail!( + "n_kv_req > n_ctx, the required kv cache size is not big enough +either reduce n_len or increase n_ctx" + ) } // print the prompt token-by-token @@ -137,7 +139,6 @@ either reduce n_len or increase n_ctx") ctx.decode(&mut batch).with_context(|| "failed to eval")?; n_decode += 1; - } eprintln!("\n"); @@ -146,10 +147,14 @@ either reduce n_len or increase n_ctx") 
let duration = Duration::from_micros((t_main_end - t_main_start) as u64); - eprintln!("decoded {} tokens in {:.2} s, speed {:.2} t/s\n", n_decode, duration.as_secs_f32(), n_decode as f32 / duration.as_secs_f32()); + eprintln!( + "decoded {} tokens in {:.2} s, speed {:.2} t/s\n", + n_decode, + duration.as_secs_f32(), + n_decode as f32 / duration.as_secs_f32() + ); println!("{}", ctx.timings()); Ok(()) - -} \ No newline at end of file +} diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs index ac6350a0..b69e5309 100644 --- a/llama-cpp-2/src/context/params.rs +++ b/llama-cpp-2/src/context/params.rs @@ -19,8 +19,8 @@ pub enum RopeScalingType { /// Create a `RopeScalingType` from a `c_int` - returns `RopeScalingType::ScalingUnspecified` if /// the value is not recognized. -impl From for RopeScalingType { - fn from(value: i8) -> Self { +impl From for RopeScalingType { + fn from(value: i32) -> Self { match value { 0 => Self::None, 1 => Self::Linear, @@ -31,7 +31,7 @@ impl From for RopeScalingType { } /// Create a `c_int` from a `RopeScalingType`. -impl From for i8 { +impl From for i32 { fn from(value: RopeScalingType) -> Self { match value { RopeScalingType::None => 0, @@ -172,7 +172,7 @@ impl LlamaContextParams { /// assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear); /// ``` pub fn with_rope_scaling_type(mut self, rope_scaling_type: RopeScalingType) -> Self { - self.context_params.rope_scaling_type = i8::from(rope_scaling_type); + self.context_params.rope_scaling_type = i32::from(rope_scaling_type); self } diff --git a/llama-cpp-2/src/llama_batch.rs b/llama-cpp-2/src/llama_batch.rs index d553123e..c2232e12 100644 --- a/llama-cpp-2/src/llama_batch.rs +++ b/llama-cpp-2/src/llama_batch.rs @@ -44,8 +44,10 @@ impl LlamaBatch { seq_ids: &[i32], logits: bool, ) -> Result<(), BatchAddError> { - if self.allocated < usize::try_from(self.n_tokens() + 1).expect("cannot fit n_tokens into a usize") { - return Err(BatchAddError::InsufficientSpace(self.allocated)) + if self.allocated + < usize::try_from(self.n_tokens() + 1).expect("cannot fit n_tokens into a usize") + { + return Err(BatchAddError::InsufficientSpace(self.allocated)); } let offset = self.llama_batch.n_tokens; let offset_usize = usize::try_from(offset).expect("cannot fit n_tokens into a usize"); @@ -55,8 +57,10 @@ impl LlamaBatch { // batch.pos [batch.n_tokens] = pos, self.llama_batch.pos.add(offset_usize).write(pos); // batch.n_seq_id[batch.n_tokens] = seq_ids.size(); - self.llama_batch.n_seq_id.add(offset_usize).write(llama_seq_id::try_from(seq_ids.len()) - .expect("cannot fit seq_ids.len() into a llama_seq_id")); + self.llama_batch.n_seq_id.add(offset_usize).write( + llama_seq_id::try_from(seq_ids.len()) + .expect("cannot fit seq_ids.len() into a llama_seq_id"), + ); // for (size_t i = 0; i < seq_ids.size(); ++i) { // batch.seq_id[batch.n_tokens][i] = seq_ids[i]; // } @@ -65,7 +69,10 @@ impl LlamaBatch { tmp.add(i).write(*seq_id); } // batch.logits [batch.n_tokens] = logits; - self.llama_batch.logits.add(offset_usize).write(i8::from(logits)); + self.llama_batch + .logits + .add(offset_usize) + .write(i8::from(logits)); } if logits { diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 0e50c19d..01c33f2b 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -126,7 +126,7 @@ impl LlamaModel { ) -> Result, StringToTokenError> { let add_bos = match add_bos { AddBos::Always => true, - AddBos::Never => false + AddBos::Never => false, }; let tokens_estimation = 
std::cmp::max(8, (str.len() / 2) + usize::from(add_bos)); @@ -136,8 +136,6 @@ impl LlamaModel { let buffer_capacity = c_int::try_from(buffer.capacity()).expect("buffer capacity should fit into a c_int"); - - let size = unsafe { llama_cpp_sys_2::llama_tokenize( self.model.as_ptr(), diff --git a/llama-cpp-2/src/token.rs b/llama-cpp-2/src/token.rs index 2109ca80..3019420d 100644 --- a/llama-cpp-2/src/token.rs +++ b/llama-cpp-2/src/token.rs @@ -10,7 +10,7 @@ pub mod data_array; #[repr(transparent)] #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] #[allow(clippy::module_name_repetitions)] -pub struct LlamaToken( pub llama_cpp_sys_2::llama_token); +pub struct LlamaToken(pub llama_cpp_sys_2::llama_token); impl Display for LlamaToken { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 720bc4cb..48ef6a43 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,6 +1,6 @@ use std::env; -use std::path::PathBuf; use std::path::Path; +use std::path::PathBuf; fn main() { println!("cargo:rerun-if-changed=llama.cpp"); @@ -8,11 +8,15 @@ fn main() { let cublas_enabled = env::var("CARGO_FEATURE_CUBLAS").is_ok(); if !Path::new("llama.cpp/ggml.c").exists() { - panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.") + panic!("llama.cpp seems to not be populated, try running `git submodule update --init --recursive` to init.") } let mut ggml = cc::Build::new(); - let mut ggml_cuda = if cublas_enabled { Some(cc::Build::new()) } else { None }; + let mut ggml_cuda = if cublas_enabled { + Some(cc::Build::new()) + } else { + None + }; let mut llama_cpp = cc::Build::new(); ggml.cpp(false); @@ -20,7 +24,9 @@ fn main() { // https://github.com/ggerganov/llama.cpp/blob/a836c8f534ab789b02da149fbdaf7735500bff74/Makefile#L364-L368 if let Some(ggml_cuda) = &mut ggml_cuda { - for lib in ["cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt"] { + for lib in [ + "cuda", "cublas", "culibos", "cudart", "cublasLt", "pthread", "dl", "rt", + ] { println!("cargo:rustc-link-lib={}", lib); } @@ -66,8 +72,7 @@ fn main() { ggml.define("_GNU_SOURCE", None); } - ggml - .std("c17") + ggml.std("c17") .file("llama.cpp/ggml.c") .file("llama.cpp/ggml-alloc.c") .file("llama.cpp/ggml-backend.c") From aec18f1e02792a240eee0558194e305add6d3811 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Mon, 5 Feb 2024 08:06:03 -0800 Subject: [PATCH 3/3] clippy --- llama-cpp-2/benches/grammar_bias.rs | 4 ++-- llama-cpp-2/examples/simple.rs | 2 +- llama-cpp-2/src/context/params.rs | 34 ++++++++++++++--------------- llama-cpp-2/src/llama_batch.rs | 4 ++++ 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/llama-cpp-2/benches/grammar_bias.rs b/llama-cpp-2/benches/grammar_bias.rs index 23681ab0..78840605 100644 --- a/llama-cpp-2/benches/grammar_bias.rs +++ b/llama-cpp-2/benches/grammar_bias.rs @@ -30,9 +30,9 @@ fn criterion_benchmark(c: &mut Criterion) { .unwrap(); let backend = LlamaBackend::init().unwrap(); let model_params = LlamaModelParams::default(); - let model = LlamaModel::load_from_file(&backend, &file, &model_params).unwrap(); + let model = LlamaModel::load_from_file(&backend, file, &model_params).unwrap(); let mut ctx = model - .new_context(&backend, &LlamaContextParams::default()) + .new_context(&backend, LlamaContextParams::default()) .unwrap(); let grammar = LlamaGrammar::from_str(include_str!("../src/grammar/json.gbnf")).unwrap(); diff --git 
a/llama-cpp-2/examples/simple.rs b/llama-cpp-2/examples/simple.rs index 943e4b10..3632ec0f 100644 --- a/llama-cpp-2/examples/simple.rs +++ b/llama-cpp-2/examples/simple.rs @@ -1,5 +1,5 @@ //! This is an translation of simple.cpp in llama.cpp using llama-cpp-2. -#![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] +#![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)] use anyhow::{bail, Context, Result}; use clap::Parser; diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs index b69e5309..5e27b0e1 100644 --- a/llama-cpp-2/src/context/params.rs +++ b/llama-cpp-2/src/context/params.rs @@ -84,7 +84,7 @@ impl LlamaContextParams { /// let params = params.with_seed(1234); /// assert_eq!(params.seed(), 1234); /// ``` - pub fn with_seed(mut self, seed: u32) -> Self { + #[must_use] pub fn with_seed(mut self, seed: u32) -> Self { self.context_params.seed = seed; self } @@ -99,7 +99,7 @@ impl LlamaContextParams { /// .with_seed(1234); /// assert_eq!(params.seed(), 1234); /// ``` - pub fn seed(&self) -> u32 { + #[must_use] pub fn seed(&self) -> u32 { self.context_params.seed } @@ -114,8 +114,8 @@ impl LlamaContextParams { /// let params = params.with_n_ctx(NonZeroU32::new(2048)); /// assert_eq!(params.n_ctx(), NonZeroU32::new(2048)); /// ``` - pub fn with_n_ctx(mut self, n_ctx: Option) -> Self { - self.context_params.n_ctx = n_ctx.map_or(0, |n_ctx| n_ctx.get()); + #[must_use] pub fn with_n_ctx(mut self, n_ctx: Option) -> Self { + self.context_params.n_ctx = n_ctx.map_or(0, std::num::NonZeroU32::get); self } @@ -128,11 +128,11 @@ impl LlamaContextParams { /// ```rust /// let params = llama_cpp_2::context::params::LlamaContextParams::default(); /// assert_eq!(params.n_ctx(), std::num::NonZeroU32::new(512)); - pub fn n_ctx(&self) -> Option { + #[must_use] pub fn n_ctx(&self) -> Option { NonZeroU32::new(self.context_params.n_ctx) } - /// Set the n_batch + /// Set the `n_batch` /// /// # Examples /// @@ -143,12 +143,12 @@ impl LlamaContextParams { /// .with_n_batch(2048); /// assert_eq!(params.n_batch(), 2048); /// ``` - pub fn with_n_batch(mut self, n_batch: u32) -> Self { + #[must_use] pub fn with_n_batch(mut self, n_batch: u32) -> Self { self.context_params.n_batch = n_batch; self } - /// Get the n_batch + /// Get the `n_batch` /// /// # Examples /// @@ -157,7 +157,7 @@ impl LlamaContextParams { /// let params = LlamaContextParams::default(); /// assert_eq!(params.n_batch(), 512); /// ``` - pub fn n_batch(&self) -> u32 { + #[must_use] pub fn n_batch(&self) -> u32 { self.context_params.n_batch } @@ -171,7 +171,7 @@ impl LlamaContextParams { /// .with_rope_scaling_type(RopeScalingType::Linear); /// assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear); /// ``` - pub fn with_rope_scaling_type(mut self, rope_scaling_type: RopeScalingType) -> Self { + #[must_use] pub fn with_rope_scaling_type(mut self, rope_scaling_type: RopeScalingType) -> Self { self.context_params.rope_scaling_type = i32::from(rope_scaling_type); self } @@ -184,7 +184,7 @@ impl LlamaContextParams { /// let params = llama_cpp_2::context::params::LlamaContextParams::default(); /// assert_eq!(params.rope_scaling_type(), llama_cpp_2::context::params::RopeScalingType::Unspecified); /// ``` - pub fn rope_scaling_type(&self) -> RopeScalingType { + #[must_use] pub fn rope_scaling_type(&self) -> RopeScalingType { RopeScalingType::from(self.context_params.rope_scaling_type) } @@ -198,7 +198,7 @@ impl LlamaContextParams { 
/// .with_rope_freq_base(0.5); /// assert_eq!(params.rope_freq_base(), 0.5); /// ``` - pub fn with_rope_freq_base(mut self, rope_freq_base: f32) -> Self { + #[must_use] pub fn with_rope_freq_base(mut self, rope_freq_base: f32) -> Self { self.context_params.rope_freq_base = rope_freq_base; self } @@ -211,7 +211,7 @@ impl LlamaContextParams { /// let params = llama_cpp_2::context::params::LlamaContextParams::default(); /// assert_eq!(params.rope_freq_base(), 0.0); /// ``` - pub fn rope_freq_base(&self) -> f32 { + #[must_use] pub fn rope_freq_base(&self) -> f32 { self.context_params.rope_freq_base } @@ -225,7 +225,7 @@ impl LlamaContextParams { /// .with_rope_freq_scale(0.5); /// assert_eq!(params.rope_freq_scale(), 0.5); /// ``` - pub fn with_rope_freq_scale(mut self, rope_freq_scale: f32) -> Self { + #[must_use] pub fn with_rope_freq_scale(mut self, rope_freq_scale: f32) -> Self { self.context_params.rope_freq_scale = rope_freq_scale; self } @@ -238,7 +238,7 @@ impl LlamaContextParams { /// let params = llama_cpp_2::context::params::LlamaContextParams::default(); /// assert_eq!(params.rope_freq_scale(), 0.0); /// ``` - pub fn rope_freq_scale(&self) -> f32 { + #[must_use] pub fn rope_freq_scale(&self) -> f32 { self.context_params.rope_freq_scale } @@ -250,7 +250,7 @@ impl LlamaContextParams { /// let params = llama_cpp_2::context::params::LlamaContextParams::default(); /// assert_eq!(params.n_threads(), 4); /// ``` - pub fn n_threads(&self) -> u32 { + #[must_use] pub fn n_threads(&self) -> u32 { self.context_params.n_threads } @@ -264,7 +264,7 @@ impl LlamaContextParams { /// .with_n_threads(8); /// assert_eq!(params.n_threads(), 8); /// ``` - pub fn with_n_threads(mut self, n_threads: u32) -> Self { + #[must_use] pub fn with_n_threads(mut self, n_threads: u32) -> Self { self.context_params.n_threads = n_threads; self } diff --git a/llama-cpp-2/src/llama_batch.rs b/llama-cpp-2/src/llama_batch.rs index c2232e12..0748dd85 100644 --- a/llama-cpp-2/src/llama_batch.rs +++ b/llama-cpp-2/src/llama_batch.rs @@ -37,6 +37,10 @@ impl LlamaBatch { /// /// - [`self.llama_batch.n_tokens`] does not fit into a usize /// - [`seq_ids.len()`] does not fit into a [`llama_seq_id`] + /// + /// # Errors + /// + /// returns a error if there is insufficient space in the buffer pub fn add( &mut self, LlamaToken(id): LlamaToken,
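
A minimal usage sketch, not part of any commit above, of the API surface these patches touch: the builder-style `LlamaContextParams` setters that are now marked `#[must_use]`, and the `RopeScalingType` conversions that moved from `i8` to `i32`. It mirrors the doc-tests visible in the `params.rs` hunks and assumes only the public items shown there; the concrete values are illustrative.

    use llama_cpp_2::context::params::{LlamaContextParams, RopeScalingType};
    use std::num::NonZeroU32;

    fn main() {
        // Chain the #[must_use] builder methods; each returns the updated params by value.
        let params = LlamaContextParams::default()
            .with_seed(1234)
            .with_n_ctx(NonZeroU32::new(2048))
            .with_n_batch(2048)
            .with_rope_scaling_type(RopeScalingType::Linear);

        assert_eq!(params.seed(), 1234);
        assert_eq!(params.n_ctx(), NonZeroU32::new(2048));
        assert_eq!(params.n_batch(), 2048);
        assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);

        // The scaling type now round-trips through i32 (previously i8).
        let raw: i32 = i32::from(RopeScalingType::Linear);
        assert_eq!(RopeScalingType::from(raw), RopeScalingType::Linear);
    }

Dropping the result of any of these `with_*` calls now triggers an `unused_must_use` warning, which matches the clippy-driven intent of the third patch.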