Merge pull request #479 from utilityai/update-llama-cpp-2024-08-30
Updated llama-cpp (bot)
MarcusDunn authored Aug 30, 2024
2 parents 5c1468a + 1f77eb0 commit 153bd17
Showing 8 changed files with 29 additions and 23 deletions.
2 changes: 1 addition & 1 deletion embeddings/src/main.rs
@@ -106,7 +106,7 @@ fn main() -> Result<()> {

// initialize the context
let ctx_params = LlamaContextParams::default()
- .with_n_threads_batch(std::thread::available_parallelism()?.get() as u32)
+ .with_n_threads_batch(std::thread::available_parallelism()?.get().try_into()?)
.with_embeddings(true);

let mut ctx = model
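Note: the change above trades a lossy `as u32` cast for a checked `try_into()` now that the batch thread count is an `i32`. A minimal, standard-library-only sketch of the same conversion pattern (illustrative only, not the embeddings example itself, which feeds the value into `LlamaContextParams`):

```rust
use std::num::TryFromIntError;

fn main() -> Result<(), TryFromIntError> {
    // available_parallelism() yields a NonZeroUsize; an `as` cast would silently
    // wrap on overflow, while try_into() returns an error that `?` can propagate.
    let threads: i32 = std::thread::available_parallelism()
        .expect("failed to query available parallelism")
        .get()
        .try_into()?;
    println!("decoding batches with {threads} threads");
    Ok(())
}
```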
4 changes: 2 additions & 2 deletions examples/simple/src/main.rs
@@ -54,12 +54,12 @@ struct Args {
long,
help = "number of threads to use during generation (default: use all available threads)"
)]
- threads: Option<u32>,
+ threads: Option<i32>,
#[arg(
long,
help = "number of threads to use during batch and prompt processing (default: use all available threads)"
)]
- threads_batch: Option<u32>,
+ threads_batch: Option<i32>,
#[arg(
short = 'c',
long,
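Note: the `--threads` and `--threads-batch` flags now parse as `Option<i32>` to match the signed thread counts used by llama.cpp. A rough, hypothetical sketch of such flags with a parallelism fallback, assuming clap with the derive feature (illustrative only, not the actual simple example):

```rust
use clap::Parser;

/// Hypothetical stand-alone sketch of the thread flags after the u32 -> i32 change.
#[derive(Parser, Debug)]
struct Args {
    #[arg(long, help = "number of threads to use during generation")]
    threads: Option<i32>,
    #[arg(long, help = "number of threads to use during batch and prompt processing")]
    threads_batch: Option<i32>,
}

fn main() {
    let args = Args::parse();
    // Fall back to all available cores, converting the usize count into the i32
    // that the thread parameters now expect.
    let all_threads = std::thread::available_parallelism()
        .map(|n| i32::try_from(n.get()).unwrap_or(i32::MAX))
        .unwrap_or(1);
    let threads = args.threads.unwrap_or(all_threads);
    let threads_batch = args.threads_batch.unwrap_or(threads);
    println!("threads = {threads}, threads_batch = {threads_batch}");
}
```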
21 changes: 13 additions & 8 deletions examples/usage/src/main.rs
@@ -1,9 +1,13 @@
- /*
- git clone --recursive https://github.com/utilityai/llama-cpp-rs
- cd llama-cpp-rs/examples/usage
- wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
- cargo run qwen2-1_5b-instruct-q4_0.gguf
- */
+ //! # Usage
+ //!
+ //! This is just about the smallest possible way to do inference. To fetch a model from hugging face:
+ //!
+ //! ```bash
+ //! git clone --recursive https://github.com/utilityai/llama-cpp-rs
+ //! cd llama-cpp-rs/examples/usage
+ //! wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
+ //! cargo run --bin usage -- qwen2-1_5b-instruct-q4_0.gguf
+ //! ```
use std::io::Write;
use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::llama_backend::LlamaBackend;
@@ -13,6 +17,7 @@ use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::model::{AddBos, Special};
use llama_cpp_2::token::data_array::LlamaTokenDataArray;

+ #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
fn main() {
let model_path = std::env::args().nth(1).expect("Please specify model path");
let backend = LlamaBackend::init().unwrap();
@@ -28,14 +33,14 @@ fn main() {
.expect("unable to create the llama_context");
let tokens_list = model
.str_to_token(&prompt, AddBos::Always)
- .expect(&format!("failed to tokenize {prompt}"));
+ .unwrap_or_else(|_| panic!("failed to tokenize {prompt}"));
let n_len = 64;

// create a llama_batch with size 512
// we use this object to submit token data for decoding
let mut batch = LlamaBatch::new(512, 1);

- let last_index: i32 = (tokens_list.len() - 1) as i32;
+ let last_index = tokens_list.len() as i32 - 1;
for (i, token) in (0_i32..).zip(tokens_list.into_iter()) {
// llama_decode will output logits only for the last token of the prompt
let is_last = i == last_index;
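Note: `expect(&format!(..))` builds its message even when tokenization succeeds, which is what clippy's `expect_fun_call` lint flags; `unwrap_or_else(|_| panic!(..))` defers the formatting to the error path. The last-index arithmetic is also kept in `i32` rather than subtracting in `usize` first. A small std-only sketch of both idioms with made-up data:

```rust
fn main() {
    let prompt = "Hello world";
    // Stand-in for the tokenizer result; the real code gets this from str_to_token.
    let tokens: Result<Vec<i32>, &str> = Ok(vec![1, 2, 3]);

    // The panic message is only formatted if the Result is actually an Err.
    let tokens_list = tokens.unwrap_or_else(|_| panic!("failed to tokenize {prompt}"));

    // Subtracting after the cast keeps the arithmetic in i32, so an empty token
    // list would give -1 instead of underflowing a usize.
    let last_index = tokens_list.len() as i32 - 1;
    println!("last prompt token index: {last_index}");
}
```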
15 changes: 8 additions & 7 deletions llama-cpp-2/src/context/params.rs
@@ -262,7 +262,7 @@ impl LlamaContextParams {
/// assert_eq!(params.n_threads(), 4);
/// ```
#[must_use]
- pub fn n_threads(&self) -> u32 {
+ pub fn n_threads(&self) -> i32 {
self.context_params.n_threads
}

@@ -275,7 +275,7 @@ impl LlamaContextParams {
/// assert_eq!(params.n_threads_batch(), 4);
/// ```
#[must_use]
- pub fn n_threads_batch(&self) -> u32 {
+ pub fn n_threads_batch(&self) -> i32 {
self.context_params.n_threads_batch
}

@@ -290,7 +290,7 @@ impl LlamaContextParams {
/// assert_eq!(params.n_threads(), 8);
/// ```
#[must_use]
- pub fn with_n_threads(mut self, n_threads: u32) -> Self {
+ pub fn with_n_threads(mut self, n_threads: i32) -> Self {
self.context_params.n_threads = n_threads;
self
}
@@ -306,7 +306,7 @@ impl LlamaContextParams {
/// assert_eq!(params.n_threads_batch(), 8);
/// ```
#[must_use]
- pub fn with_n_threads_batch(mut self, n_threads: u32) -> Self {
+ pub fn with_n_threads_batch(mut self, n_threads: i32) -> Self {
self.context_params.n_threads_batch = n_threads;
self
}
@@ -354,9 +354,9 @@ impl LlamaContextParams {
/// }
///
/// use llama_cpp_2::context::params::LlamaContextParams;
- /// let params = LlamaContextParams::default();
- /// params.with_cb_eval(Some(cb_eval_fn));
+ /// let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));
/// ```
+ #[must_use]
pub fn with_cb_eval(
mut self,
cb_eval: llama_cpp_sys_2::ggml_backend_sched_eval_callback,
@@ -373,8 +373,9 @@ impl LlamaContextParams {
/// use llama_cpp_2::context::params::LlamaContextParams;
/// let params = LlamaContextParams::default();
/// let user_data = std::ptr::null_mut();
- /// params.with_cb_eval_user_data(user_data);
+ /// let params = params.with_cb_eval_user_data(user_data);
/// ```
+ #[must_use]
pub fn with_cb_eval_user_data(mut self, cb_eval_user_data: *mut std::ffi::c_void) -> Self {
self.context_params.cb_eval_user_data = cb_eval_user_data;
self
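Note: the doctests now bind or chain the value returned by the builders, and the builders themselves gained `#[must_use]` so a silently dropped return value warns. A short sketch mirroring the doctests, assuming `llama-cpp-2` is available as a dependency:

```rust
use llama_cpp_2::context::params::LlamaContextParams;

fn main() {
    // The with_* methods take `self` by value and return the updated params, so
    // the result has to be bound or chained; #[must_use] warns if it is dropped.
    let params = LlamaContextParams::default()
        .with_n_threads(8)
        .with_n_threads_batch(8);
    assert_eq!(params.n_threads(), 8);
    assert_eq!(params.n_threads_batch(), 8);
}
```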
2 changes: 1 addition & 1 deletion llama-cpp-2/src/grammar.rs
@@ -294,7 +294,7 @@ impl ParseState {
type_: gre_type,
value: c as _,
});
- if rest.starts_with("-") && rest.get(1..).is_some_and(|r| !r.starts_with("]")) {
+ if rest.starts_with('-') && rest.get(1..).is_some_and(|r| !r.starts_with(']')) {
let (c, r) = Self::parse_char(&rest[1..])?;
rest = r;
rule.push(llama_grammar_element {
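Note: a single-character pattern (`'-'`, `']'`) is the form clippy's `single_char_pattern` lint suggests; behaviour is unchanged. A tiny std-only illustration with a made-up character-class fragment:

```rust
fn main() {
    // Hypothetical remainder of a character-class body, e.g. from "[a-z]".
    let rest = "-z]";
    // starts_with accepts a char pattern directly; this matches exactly the same
    // inputs as the previous "-" / "]" string patterns.
    let is_range = rest.starts_with('-') && rest.get(1..).is_some_and(|r| !r.starts_with(']'));
    assert!(is_range);
}
```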
2 changes: 1 addition & 1 deletion llama-cpp-2/src/model.rs
@@ -333,7 +333,7 @@ impl LlamaModel {
let len = string.as_bytes().len();
let len = c_int::try_from(len).expect("length fits into c_int");
let buf = string.into_raw();
- let lstrip = lstrip.map(|it| i32::from(it.get())).unwrap_or(0);
+ let lstrip = lstrip.map_or(0, |it| i32::from(it.get()));
let size = unsafe {
llama_cpp_sys_2::llama_token_to_piece(
self.model.as_ptr(),
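Note: `map_or(default, f)` folds the `None` default and the `Some` mapping into one call, which is what clippy's `map_unwrap_or` lint suggests over `.map(..).unwrap_or(0)`. A std-only sketch; the `NonZeroU16` type is an assumption for illustration, since the hunk does not show `lstrip`'s declared type:

```rust
use std::num::NonZeroU16;

fn main() {
    let lstrip: Option<NonZeroU16> = NonZeroU16::new(2);
    // Equivalent to .map(|it| i32::from(it.get())).unwrap_or(0), in a single call.
    let lstrip = lstrip.map_or(0, |it| i32::from(it.get()));
    assert_eq!(lstrip, 2);
}
```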
4 changes: 2 additions & 2 deletions llama-cpp-2/src/token_type.rs
@@ -20,7 +20,7 @@ pub enum LlamaTokenAttr {
SingleWord = llama_cpp_sys_2::LLAMA_TOKEN_ATTR_SINGLE_WORD as _,
}

- /// A set of LlamaTokenAttrs
+ /// A set of `LlamaTokenAttrs`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LlamaTokenAttrs(pub BitFlags<LlamaTokenAttr>);

@@ -42,7 +42,7 @@ impl TryFrom<llama_cpp_sys_2::llama_token_type> for LlamaTokenAttrs {
type Error = LlamaTokenTypeFromIntError;

fn try_from(value: llama_cpp_sys_2::llama_vocab_type) -> Result<Self, Self::Error> {
- Ok(Self(BitFlags::from_bits(value as u32).map_err(|e| {
+ Ok(Self(BitFlags::from_bits(value).map_err(|e| {
LlamaTokenTypeFromIntError::UnknownValue(e.invalid_bits())
})?))
}
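Note: dropping the `value as u32` cast means `BitFlags::from_bits` validates the raw value at whatever width the bindings provide, and `invalid_bits()` reports anything unrecognised. A self-contained sketch with the `enumflags2` crate (the same `BitFlags` and `from_bits`/`invalid_bits` calls the hunk uses); the `Attr` enum here is hypothetical, not the crate's actual `LlamaTokenAttr`:

```rust
use enumflags2::{bitflags, BitFlags};

// Hypothetical attribute flags, standing in for LlamaTokenAttr.
#[bitflags]
#[repr(u32)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Attr {
    Normal = 0b001,
    Control = 0b010,
    Unknown = 0b100,
}

fn main() {
    // from_bits validates the raw integer instead of silently truncating it.
    let attrs: BitFlags<Attr> = BitFlags::from_bits(0b011).expect("unknown attribute bits");
    assert!(attrs.contains(Attr::Normal) && attrs.contains(Attr::Control));

    // Unrecognised bits come back as an error that exposes exactly which bits failed.
    let err = BitFlags::<Attr>::from_bits(0b1000).unwrap_err();
    println!("invalid bits: {:#x}", err.invalid_bits());
}
```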
2 changes: 1 addition & 1 deletion llama-cpp-sys-2/llama.cpp
Submodule llama.cpp updated 58 files
+11 −14 .devops/full-cuda.Dockerfile
+13 −11 .devops/llama-cli-cuda.Dockerfile
+16 −13 .devops/llama-server-cuda.Dockerfile
+2 −0 .devops/llama-server-intel.Dockerfile
+2 −0 .devops/llama-server-rocm.Dockerfile
+2 −0 .devops/llama-server-vulkan.Dockerfile
+2 −0 .devops/llama-server.Dockerfile
+1 −1 .ecrc
+3 −12 .github/workflows/docker.yml
+334 −23 common/common.cpp
+23 −7 common/common.h
+5,990 −6,398 common/stb_image.h
+2 −2 convert_hf_to_gguf.py
+2 −2 docs/backend/SYCL.md
+2 −2 docs/docker.md
+1 −1 examples/baby-llama/baby-llama.cpp
+2 −2 examples/benchmark/benchmark-matmult.cpp
+2 −2 examples/cvector-generator/cvector-generator.cpp
+1 −1 examples/export-lora/export-lora.cpp
+119 −6 examples/llama-bench/llama-bench.cpp
+2 −2 examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+2 −2 examples/llava/README-minicpmv2.5.md
+1 −5 examples/llava/clip.cpp
+2 −2 examples/llava/llava-cli.cpp
+1 −1 examples/llava/minicpmv-cli.cpp
+37 −0 examples/main/main.cpp
+43 −17 examples/server/README.md
+3 −3 examples/server/server.cpp
+4 −3 examples/speculative/speculative.cpp
+3 −3 flake.lock
+2 −2 ggml/include/ggml-alloc.h
+2 −0 ggml/include/ggml-backend.h
+126 −40 ggml/include/ggml.h
+1 −1 ggml/src/CMakeLists.txt
+20 −5 ggml/src/ggml-backend.c
+20 −1 ggml/src/ggml-cuda.cu
+8 −0 ggml/src/ggml-cuda/binbcast.cu
+1 −0 ggml/src/ggml-cuda/binbcast.cuh
+106 −0 ggml/src/ggml-cuda/cross-entropy-loss.cu
+5 −0 ggml/src/ggml-cuda/cross-entropy-loss.cuh
+1 −2 ggml/src/ggml-cuda/sumrows.cu
+2 −0 ggml/src/ggml-cuda/sumrows.cuh
+56 −0 ggml/src/ggml-cuda/unary.cu
+6 −0 ggml/src/ggml-cuda/unary.cuh
+61 −1 ggml/src/ggml-metal.m
+81 −1 ggml/src/ggml-metal.metal
+1 −1 ggml/src/ggml-quants.c
+62 −0 ggml/src/ggml-vulkan.cpp
+1,341 −271 ggml/src/ggml.c
+15 −0 ggml/src/vulkan-shaders/cos.comp
+15 −0 ggml/src/vulkan-shaders/sin.comp
+8 −0 ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+12 −5 include/llama.h
+1 −1 scripts/sync-ggml.last
+42 −18 src/llama.cpp
+77 −0 tests/test-backend-ops.cpp
+179 −66 tests/test-grad0.cpp
+1 −1 tests/test-rope.cpp
