Merge branch 'main' into 8-metal-on-mac
MarcusDunn authored Feb 12, 2024
2 parents 8a73403 + ab4da04 commit 870c5c8
Showing 12 changed files with 123 additions and 82 deletions.
9 changes: 7 additions & 2 deletions .github/workflows/update-toml-version.yaml
@@ -17,6 +17,8 @@ jobs:
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11

- name: Update version in TOML files
env:
GH_TOKEN: ${{ github.token }}
run: |
# Extract the current version from the TOML file
CURRENT_VERSION=$(awk -F '"' '/^version/ {print $2}' llama-cpp-2/Cargo.toml)
@@ -32,5 +34,8 @@ jobs:
git config --global user.name "GitHub Actions"
git add llama-cpp-sys-2/Cargo.toml llama-cpp-2/Cargo.toml
git commit -m "Bump version to $NEXT_VERSION [skip ci]"
# Push the changes back to the repository
git push origin main:$GITHUB_REF
# Create a branch for the changes
git checkout -b version-bump-$NEXT_VERSION
# Push the changes and create a pull request
git push origin version-bump-$NEXT_VERSION
gh pr create --base main --head version-bump-$NEXT_VERSION --title "Bump version to $NEXT_VERSION"
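
For readers who don't parse awk at a glance: the extraction step above simply grabs the quoted value from the first line of llama-cpp-2/Cargo.toml that starts with `version`. A rough Rust equivalent, purely illustrative and not part of this commit, might look like:

```rust
use std::fs;

/// Illustrative stand-in for `awk -F '"' '/^version/ {print $2}'`:
/// take the first line starting with `version` and return its first quoted field.
fn current_version(cargo_toml: &str) -> Option<String> {
    cargo_toml
        .lines()
        .find(|line| line.starts_with("version"))
        .and_then(|line| line.split('"').nth(1))
        .map(ToOwned::to_owned)
}

fn main() -> std::io::Result<()> {
    let toml = fs::read_to_string("llama-cpp-2/Cargo.toml")?;
    match current_version(&toml) {
        Some(version) => println!("current version: {version}"),
        None => eprintln!("no version line found"),
    }
    Ok(())
}
```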
40 changes: 17 additions & 23 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -11,7 +11,7 @@ tracing = "0.1"
hf-hub = { version = "0.3.2" }
criterion = "0.5.1"
pprof = "0.13.0"
bindgen = "0.69.2"
bindgen = "0.69.4"
cc = "1.0.83"

[workspace.lints.rust]
2 changes: 1 addition & 1 deletion llama-cpp-2/Cargo.toml
@@ -19,7 +19,7 @@ criterion = { workspace = true }
pprof = { workspace = true, features = ["criterion", "flamegraph"] }

# used in examples
clap = { version = "4.4.18", features = ["derive"] }
clap = { version = "4.5.0", features = ["derive"] }
anyhow = "1.0.79"

[[bench]]
2 changes: 1 addition & 1 deletion llama-cpp-2/benches/grammar_bias.rs
@@ -30,7 +30,7 @@ fn criterion_benchmark(c: &mut Criterion) {
.unwrap();
let backend = LlamaBackend::init().unwrap();
let model_params = LlamaModelParams::default();
let model = LlamaModel::load_from_file(&backend, &file, &model_params).unwrap();
let model = LlamaModel::load_from_file(&backend, file, &model_params).unwrap();
let mut ctx = model
.new_context(&backend, LlamaContextParams::default())
.unwrap();
45 changes: 25 additions & 20 deletions llama-cpp-2/examples/simple.rs
@@ -1,21 +1,20 @@
//! This is an translation of simple.cpp in llama.cpp using llama-cpp-2.
#![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
#![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)]

use std::io::Write;
use std::num::NonZeroU32;
use std::path::PathBuf;
use std::time::Duration;
use anyhow::{bail, Context, Result};
use clap::Parser;
use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::model::params::LlamaModelParams;
use anyhow::{bail, Context, Result};
use llama_cpp_2::ggml_time_us;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::token::data_array::LlamaTokenDataArray;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::model::AddBos;

use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::token::data_array::LlamaTokenDataArray;
use std::io::Write;
use std::num::NonZeroU32;
use std::path::PathBuf;
use std::time::Duration;

#[derive(clap::Parser)]
struct Args {
@@ -30,7 +29,6 @@ struct Args {
disable_gpu: bool,
}


fn main() -> Result<()> {
let params = Args::parse();

@@ -60,12 +58,14 @@ fn main() -> Result<()> {
.with_n_ctx(NonZeroU32::new(2048))
.with_seed(1234);

let mut ctx = model.new_context(&backend, ctx_params)
let mut ctx = model
.new_context(&backend, ctx_params)
.with_context(|| "unable to create the llama_context")?;

// tokenize the prompt

let tokens_list = model.str_to_token(&params.prompt, AddBos::Always)
let tokens_list = model
.str_to_token(&params.prompt, AddBos::Always)
.with_context(|| format!("failed to tokenize {}", params.prompt))?;

let n_cxt = ctx.n_ctx() as i32;
@@ -75,8 +75,10 @@

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if n_kv_req > n_cxt {
bail!("n_kv_req > n_ctx, the required kv cache size is not big enough
either reduce n_len or increase n_ctx")
bail!(
"n_kv_req > n_ctx, the required kv cache size is not big enough
either reduce n_len or increase n_ctx"
)
}

// print the prompt token-by-token
@@ -137,7 +139,6 @@ either reduce n_len or increase n_ctx")
ctx.decode(&mut batch).with_context(|| "failed to eval")?;

n_decode += 1;

}

eprintln!("\n");
@@ -146,10 +147,14 @@ either reduce n_len or increase n_ctx")

let duration = Duration::from_micros((t_main_end - t_main_start) as u64);

eprintln!("decoded {} tokens in {:.2} s, speed {:.2} t/s\n", n_decode, duration.as_secs_f32(), n_decode as f32 / duration.as_secs_f32());
eprintln!(
"decoded {} tokens in {:.2} s, speed {:.2} t/s\n",
n_decode,
duration.as_secs_f32(),
n_decode as f32 / duration.as_secs_f32()
);

println!("{}", ctx.timings());

Ok(())

}
}
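
As an aside on the reformatted eprintln! at the end of simple.rs: the throughput line is plain arithmetic over the two ggml_time_us() readings. A minimal stand-alone sketch of the same calculation, using made-up timestamps in place of real measurements, could be:

```rust
use std::time::Duration;

fn main() {
    // Hypothetical microsecond timestamps standing in for ggml_time_us() readings.
    let t_main_start: i64 = 1_000_000;
    let t_main_end: i64 = 3_500_000;
    let n_decode: i32 = 64; // tokens generated in that window

    // Same arithmetic as the example: elapsed wall time, then tokens per second.
    let duration = Duration::from_micros((t_main_end - t_main_start) as u64);
    eprintln!(
        "decoded {} tokens in {:.2} s, speed {:.2} t/s",
        n_decode,
        duration.as_secs_f32(),
        n_decode as f32 / duration.as_secs_f32()
    );
}
```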