Merge branch 'main' into 8-metal-on-mac
MarcusDunn authored Feb 12, 2024
2 parents 8a73403 + ab4da04 commit 870c5c8
Showing 12 changed files with 123 additions and 82 deletions.
9 changes: 7 additions & 2 deletions .github/workflows/update-toml-version.yaml
@@ -17,6 +17,8 @@ jobs:
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11

- name: Update version in TOML files
env:
GH_TOKEN: ${{ github.token }}
run: |
# Extract the current version from the TOML file
CURRENT_VERSION=$(awk -F '"' '/^version/ {print $2}' llama-cpp-2/Cargo.toml)
@@ -32,5 +34,8 @@ jobs:
git config --global user.name "GitHub Actions"
git add llama-cpp-sys-2/Cargo.toml llama-cpp-2/Cargo.toml
git commit -m "Bump version to $NEXT_VERSION [skip ci]"
# Push the changes back to the repository
git push origin main:$GITHUB_REF
# Create a branch for the changes
git checkout -b version-bump-$NEXT_VERSION
# Push the changes and create a pull request
git push origin version-bump-$NEXT_VERSION
gh pr create --base main --head version-bump-$NEXT_VERSION --title "Bump version to $NEXT_VERSION"
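
For readers who don't parse awk at a glance: the extraction step above simply grabs the quoted value from the first line of llama-cpp-2/Cargo.toml that starts with `version`. A rough Rust equivalent, purely illustrative and not part of this commit, might look like:

```rust
use std::fs;

/// Illustrative stand-in for `awk -F '"' '/^version/ {print $2}'`:
/// take the first line starting with `version` and return its first quoted field.
fn current_version(cargo_toml: &str) -> Option<String> {
    cargo_toml
        .lines()
        .find(|line| line.starts_with("version"))
        .and_then(|line| line.split('"').nth(1))
        .map(ToOwned::to_owned)
}

fn main() -> std::io::Result<()> {
    let toml = fs::read_to_string("llama-cpp-2/Cargo.toml")?;
    match current_version(&toml) {
        Some(version) => println!("current version: {version}"),
        None => eprintln!("no version line found"),
    }
    Ok(())
}
```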
40 changes: 17 additions & 23 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -11,7 +11,7 @@ tracing = "0.1"
hf-hub = { version = "0.3.2" }
criterion = "0.5.1"
pprof = "0.13.0"
bindgen = "0.69.2"
bindgen = "0.69.4"
cc = "1.0.83"

[workspace.lints.rust]
2 changes: 1 addition & 1 deletion llama-cpp-2/Cargo.toml
@@ -19,7 +19,7 @@ criterion = { workspace = true }
pprof = { workspace = true, features = ["criterion", "flamegraph"] }

# used in examples
clap = { version = "4.4.18", features = ["derive"] }
clap = { version = "4.5.0", features = ["derive"] }
anyhow = "1.0.79"

[[bench]]
2 changes: 1 addition & 1 deletion llama-cpp-2/benches/grammar_bias.rs
@@ -30,7 +30,7 @@ fn criterion_benchmark(c: &mut Criterion) {
.unwrap();
let backend = LlamaBackend::init().unwrap();
let model_params = LlamaModelParams::default();
let model = LlamaModel::load_from_file(&backend, &file, &model_params).unwrap();
let model = LlamaModel::load_from_file(&backend, file, &model_params).unwrap();
let mut ctx = model
.new_context(&backend, LlamaContextParams::default())
.unwrap();
45 changes: 25 additions & 20 deletions llama-cpp-2/examples/simple.rs
@@ -1,21 +1,20 @@
//! This is an translation of simple.cpp in llama.cpp using llama-cpp-2.
#![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
#![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)]

use std::io::Write;
use std::num::NonZeroU32;
use std::path::PathBuf;
use std::time::Duration;
use anyhow::{bail, Context, Result};
use clap::Parser;
use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::model::params::LlamaModelParams;
use anyhow::{bail, Context, Result};
use llama_cpp_2::ggml_time_us;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::token::data_array::LlamaTokenDataArray;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::model::AddBos;

use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::token::data_array::LlamaTokenDataArray;
use std::io::Write;
use std::num::NonZeroU32;
use std::path::PathBuf;
use std::time::Duration;

#[derive(clap::Parser)]
struct Args {
@@ -30,7 +29,6 @@ struct Args {
disable_gpu: bool,
}


fn main() -> Result<()> {
let params = Args::parse();

@@ -60,12 +58,14 @@ fn main() -> Result<()> {
.with_n_ctx(NonZeroU32::new(2048))
.with_seed(1234);

let mut ctx = model.new_context(&backend, ctx_params)
let mut ctx = model
.new_context(&backend, ctx_params)
.with_context(|| "unable to create the llama_context")?;

// tokenize the prompt

let tokens_list = model.str_to_token(&params.prompt, AddBos::Always)
let tokens_list = model
.str_to_token(&params.prompt, AddBos::Always)
.with_context(|| format!("failed to tokenize {}", params.prompt))?;

let n_cxt = ctx.n_ctx() as i32;
@@ -75,8 +75,10 @@

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if n_kv_req > n_cxt {
bail!("n_kv_req > n_ctx, the required kv cache size is not big enough
either reduce n_len or increase n_ctx")
bail!(
"n_kv_req > n_ctx, the required kv cache size is not big enough
either reduce n_len or increase n_ctx"
)
}

// print the prompt token-by-token
@@ -137,7 +139,6 @@ either reduce n_len or increase n_ctx")
ctx.decode(&mut batch).with_context(|| "failed to eval")?;

n_decode += 1;

}

eprintln!("\n");
@@ -146,10 +147,14 @@ either reduce n_len or increase n_ctx")

let duration = Duration::from_micros((t_main_end - t_main_start) as u64);

eprintln!("decoded {} tokens in {:.2} s, speed {:.2} t/s\n", n_decode, duration.as_secs_f32(), n_decode as f32 / duration.as_secs_f32());
eprintln!(
"decoded {} tokens in {:.2} s, speed {:.2} t/s\n",
n_decode,
duration.as_secs_f32(),
n_decode as f32 / duration.as_secs_f32()
);

println!("{}", ctx.timings());

Ok(())

}
}
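
As an aside on the reformatted eprintln! at the end of simple.rs: the throughput line is plain arithmetic over the two ggml_time_us() readings. A minimal stand-alone sketch of the same calculation, using made-up timestamps in place of real measurements, could be:

```rust
use std::time::Duration;

fn main() {
    // Hypothetical microsecond timestamps standing in for ggml_time_us() readings.
    let t_main_start: i64 = 1_000_000;
    let t_main_end: i64 = 3_500_000;
    let n_decode: i32 = 64; // tokens generated in that window

    // Same arithmetic as the example: elapsed wall time, then tokens per second.
    let duration = Duration::from_micros((t_main_end - t_main_start) as u64);
    eprintln!(
        "decoded {} tokens in {:.2} s, speed {:.2} t/s",
        n_decode,
        duration.as_secs_f32(),
        n_decode as f32 / duration.as_secs_f32()
    );
}
```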