updated llama.cpp #31

Merged: 2 commits, Jan 19, 2024
llama-cpp-2/examples/simple.rs (1 addition, 1 deletion)

@@ -62,7 +62,7 @@ fn main() -> Result<()> {
         ..LlamaContextParams::default()
     };

-    let mut ctx = model.new_context(&backend, &ctx_params)
+    let mut ctx = model.new_context(&backend, ctx_params)
         .with_context(|| "unable to create the llama_context")?;

     // tokenize the prompt
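This one-line change follows the API change in model.rs below: `new_context` now takes `LlamaContextParams` by value, so the example moves `ctx_params` in rather than borrowing it. A minimal sketch of the adjusted call site (setup of `backend` and `model` elided; the `offload_kqv` choice is illustrative):

```rust
use anyhow::Context;
use llama_cpp_2::context::params::LlamaContextParams;

let ctx_params = LlamaContextParams {
    offload_kqv: true,
    ..LlamaContextParams::default()
};

// Previously: model.new_context(&backend, &ctx_params)
let mut ctx = model
    .new_context(&backend, ctx_params)
    .with_context(|| "unable to create the llama_context")?;
```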
llama-cpp-2/src/context/params.rs (12 additions, 2 deletions)

@@ -43,7 +43,7 @@ impl From<RopeScalingType> for i8 {
 }

 /// A safe wrapper around `llama_context_params`.
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, PartialEq)]
 #[allow(
     missing_docs,
     clippy::struct_excessive_bools,
@@ -71,6 +71,8 @@ pub struct LlamaContextParams {
     pub logits_all: bool,
     pub embedding: bool,
     pub offload_kqv: bool,
+    pub cb_eval: llama_cpp_sys_2::ggml_backend_sched_eval_callback,
+    pub cb_eval_user_data: *mut std::ffi::c_void,
 }

 /// Default parameters for `LlamaContext`. (as defined in llama.cpp by `llama_context_default_params`)
@@ -97,6 +99,8 @@ impl From<llama_context_params> for LlamaContextParams {
             n_threads_batch,
             rope_freq_base,
             rope_freq_scale,
+            cb_eval,
+            cb_eval_user_data,
             type_k,
             type_v,
             mul_mat_q,
@@ -131,6 +135,8 @@ impl From<llama_context_params> for LlamaContextParams {
             yarn_beta_slow,
             yarn_orig_ctx,
             offload_kqv,
+            cb_eval,
+            cb_eval_user_data,
         }
     }
 }
@@ -157,6 +163,8 @@ impl From<LlamaContextParams> for llama_context_params {
             yarn_beta_slow,
             yarn_orig_ctx,
             offload_kqv,
+            cb_eval,
+            cb_eval_user_data,
         }: LlamaContextParams,
     ) -> Self {
         llama_context_params {
@@ -179,6 +187,8 @@ impl From<LlamaContextParams> for llama_context_params {
             yarn_beta_slow,
             yarn_orig_ctx,
             offload_kqv,
+            cb_eval,
+            cb_eval_user_data,
         }
     }
 }
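Two things happen here: the `Clone`/`Copy` derives are dropped (the struct now carries a raw user-data pointer, so implicit copies are presumably no longer wanted), and the params gain pass-through fields for ggml's scheduler eval callback. A sketch of how a caller might set them, assuming the usual bindgen representation of `ggml_backend_sched_eval_callback` as `Option<unsafe extern "C" fn(*mut ggml_tensor, bool, *mut c_void) -> bool>` (that signature is taken from ggml, not from this diff):

```rust
use std::ffi::c_void;

// Sketch of a pass-through eval callback; signature assumed as above.
unsafe extern "C" fn observe_eval(
    _tensor: *mut llama_cpp_sys_2::ggml_tensor,
    _ask: bool,
    _user_data: *mut c_void,
) -> bool {
    true // keep the scheduler evaluating
}

let ctx_params = LlamaContextParams {
    cb_eval: Some(observe_eval),
    cb_eval_user_data: std::ptr::null_mut(),
    ..LlamaContextParams::default()
};
```

With `Copy` gone, `llama_context_params::from(params)` can no longer work on a dereferenced borrow, which is exactly what the model.rs change below addresses.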
llama-cpp-2/src/model.rs (5 additions, 5 deletions)

@@ -309,12 +309,12 @@ impl LlamaModel {
     /// # Errors
     ///
     /// There is many ways this can fail. See [`LlamaContextLoadError`] for more information.
-    pub fn new_context<'a>(
-        &'a self,
+    pub fn new_context(
+        &self,
         _: &LlamaBackend,
-        params: &LlamaContextParams,
-    ) -> Result<LlamaContext<'a>, LlamaContextLoadError> {
-        let context_params = llama_context_params::from(*params);
+        params: LlamaContextParams,
+    ) -> Result<LlamaContext, LlamaContextLoadError> {
+        let context_params = llama_context_params::from(params);
         let context = unsafe {
             llama_cpp_sys_2::llama_new_context_with_model(self.model.as_ptr(), context_params)
         };
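Dropping the explicit `'a` is safe because lifetime elision still ties the returned `LlamaContext` to `&self`, so the signatures are equivalent; the substantive change is that `params` is consumed. Since `LlamaContextParams` is no longer `Clone`/`Copy`, a caller that wants two contexts with the same settings now builds the struct twice, sketched here with a hypothetical `make_params` helper:

```rust
// Hypothetical helper, not part of the crate.
fn make_params() -> LlamaContextParams {
    LlamaContextParams {
        offload_kqv: true,
        ..LlamaContextParams::default()
    }
}

let ctx_a = model.new_context(&backend, make_params())?;
let ctx_b = model.new_context(&backend, make_params())?;
```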