Binary Tensor Data Extension¶
The Binary Tensor Data Extension allows clients to send and receive tensor data in a binary format in the body of an HTTP/REST request. This extension is particularly useful for FP16 data, since the Open Inference Protocol has no specific data type for a 16-bit float, and for large tensors in high-throughput scenarios.
Overview¶
Tensor data represented as binary data is organized in little-endian byte order, row major, without stride or padding between elements. All tensor data types are representable as binary data in the native size of the data type. For the BOOL type, an element is a single byte with value 1 for true and value 0 for false. For the BYTES type, an element is represented by a 4-byte unsigned integer giving the length, followed by the actual bytes. The binary data for a tensor is delivered in the HTTP body after the JSON object (see Examples).
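For illustration, the following sketch (ours, not part of the protocol or the KServe SDK) shows how these rules map tensors to bytes using NumPy:

import struct

import numpy as np

# FP16 tensor: little-endian, row major, no padding between elements.
fp16 = np.array([[1.0, 2.0], [3.0, 4.0]], dtype="<f2")  # "<f2" = little-endian FP16
print(len(fp16.tobytes(order="C")))  # 8 bytes: 4 elements * 2 bytes each

# BOOL tensor: one byte per element, 1 for true and 0 for false.
bools = np.array([True, False, True], dtype=np.bool_)
print(bools.tobytes())  # b'\x01\x00\x01'

# BYTES element: 4-byte unsigned little-endian length prefix, then the raw bytes.
def encode_bytes_element(value: bytes) -> bytes:
    return struct.pack("<I", len(value)) + value

print(encode_bytes_element(b"hello"))  # b'\x05\x00\x00\x00hello'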
The binary tensor data extension uses parameters to indicate that an input or output tensor is communicated as binary data.
The binary_data_size parameter is used in $request_input and $response_output to indicate that the input or output tensor is communicated as binary data:

- "binary_data_size" : int64 parameter indicating the size of the tensor binary data, in bytes.
The binary_data parameter is used in $request_output to indicate that the output should be returned from the KServe runtime as binary data:

- "binary_data" : bool parameter that is true if the output should be returned as binary data and false (or not given) if the tensor should be returned as JSON.
The binary_data_output parameter is used in $inference_request to indicate that all outputs should be returned from the KServe runtime as binary data, unless overridden by "binary_data" on a specific output:

- "binary_data_output" : bool parameter that is true if all outputs should be returned as binary data and false (or not given) if the outputs should be returned as JSON. If "binary_data" is specified on an output it overrides this setting.
When one or more tensors are communicated as binary data, the HTTP body of the request or response will contain the JSON inference request or response object followed by the binary tensor data, in the same order as the input or output tensors are specified in the JSON.
- If any binary data is present in the request or response, the Inference-Header-Content-Length header must be provided to give the length of the JSON object, and Content-Length continues to give the full body length (as HTTP requires).
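Putting these rules together, a client assembling a request computes the two lengths roughly as follows. This is a minimal sketch using NumPy and the standard library; the endpoint, model, and tensor are placeholders:

import json

import numpy as np

fp16_input = np.array([[1.1, 2.2], [3.3, 4.4]], dtype="<f2")

# JSON part of the body, declaring the binary size of the tensor.
json_part = json.dumps({
    "model_name": "mymodel",
    "inputs": [{
        "name": "input0",
        "shape": [2, 2],
        "datatype": "FP16",
        "parameters": {"binary_data_size": fp16_input.nbytes},
    }],
}).encode("utf-8")

# Binary tensor data follows the JSON object, in input order.
body = json_part + fp16_input.tobytes()

headers = {
    "Content-Type": "application/octet-stream",
    # Length of the JSON object only.
    "Inference-Header-Content-Length": str(len(json_part)),
    # Length of the entire body, as HTTP requires.
    "Content-Length": str(len(body)),
}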
Examples¶
Sending and Receiving Binary Data¶
For the following request, the input tensors input0 and input2 are sent as binary data while input1 is sent as non-binary data. Note that the input0 and input2 input tensors carry a binary_data_size parameter giving the size of their binary data in bytes: 8 bytes for the [ 2, 2 ] FP16 tensor and 3 bytes for the [ 3 ] BOOL tensor. The output tensor output0 must be returned as binary data, as requested by setting the binary_data parameter to true. Also note that the size of the JSON part is provided in the Inference-Header-Content-Length header, while the Content-Length header gives the length of the entire body, JSON plus binary data.
POST /v2/models/mymodel/infer HTTP/1.1
Host: localhost:8000
Content-Type: application/octet-stream
Inference-Header-Content-Length: <xx> # Json length
Content-Length: <xx+11> # Json length + binary data length (In this case 8 + 3 = 11)
{
  "model_name" : "mymodel",
  "inputs" : [
    {
      "name" : "input0",
      "shape" : [ 2, 2 ],
      "datatype" : "FP16",
      "parameters" : {
        "binary_data_size" : 8
      }
    },
    {
      "name" : "input1",
      "shape" : [ 2, 2 ],
      "datatype" : "UINT32",
      "data" : [[1, 2], [3, 4]]
    },
    {
      "name" : "input2",
      "shape" : [ 3 ],
      "datatype" : "BOOL",
      "parameters" : {
        "binary_data_size" : 3
      }
    }
  ],
  "outputs" : [
    {
      "name" : "output0",
      "parameters" : {
        "binary_data" : true
      }
    },
    {
      "name" : "output1"
    }
  ]
}
<8 bytes of data for input0 tensor>
<3 bytes of data for input2 tensor>
Assuming the model returns a [ 3, 2 ] tensor of data type FP16 and a [ 2, 2 ] tensor of data type FP32, the following response would be returned.
HTTP/1.1 200 OK
Content-Type: application/octet-stream
Inference-Header-Content-Length: <yy> # Json length
Content-Length: <yy+12> # Json length + binary data length (In this case 3 * 2 * 2 = 12)
{
  "outputs" : [
    {
      "name" : "output0",
      "shape" : [ 3, 2 ],
      "datatype" : "FP16",
      "parameters" : {
        "binary_data_size" : 12
      }
    },
    {
      "name" : "output1",
      "shape" : [ 2, 2 ],
      "datatype" : "FP32",
      "data" : [[1.203, 5.403], [3.434, 34.234]]
    }
  ]
}
<12 bytes of data for output0 tensor>
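The same exchange can be performed with the KServe Python client. The snippet below must run inside an async function and assumes the mymodel service is reachable at localhost:8000.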
import numpy as np

from kserve import InferenceRESTClient, InferRequest, InferInput
from kserve.protocol.infer_type import RequestedOutput
from kserve.inference_client import RESTConfig

fp16_data = np.array([[1.1, 2.22], [3.345, 4.34343]], dtype=np.float16)
uint32_data = np.array([[1, 2], [3, 4]], dtype=np.uint32)
bool_data = np.array([True, False, True], dtype=np.bool_)

# Create the input tensors; input0 and input2 are sent as binary data
input_0 = InferInput(name="input0", datatype="FP16", shape=[2, 2])
input_0.set_data_from_numpy(fp16_data, binary_data=True)
input_1 = InferInput(name="input1", datatype="UINT32", shape=[2, 2])
input_1.set_data_from_numpy(uint32_data, binary_data=False)
input_2 = InferInput(name="input2", datatype="BOOL", shape=[3])
input_2.set_data_from_numpy(bool_data, binary_data=True)

# Create the requested outputs; output0 is returned as binary data
output_0 = RequestedOutput(name="output0", binary_data=True)
output_1 = RequestedOutput(name="output1", binary_data=False)

# Create the inference request
infer_request = InferRequest(
    model_name="mymodel",
    request_id="2ja0ls9j1309",
    infer_inputs=[input_0, input_1, input_2],
    requested_outputs=[output_0, output_1],
)

# Create the REST client
config = RESTConfig(verbose=True, protocol="v2")
rest_client = InferenceRESTClient(config=config)

# Send the request
infer_response = await rest_client.infer(
    "http://localhost:8000",
    model_name="mymodel",
    data=infer_request,
    headers={"Host": "test-server.com"},
    timeout=2,
)

# Read the binary data from the response
output_0 = infer_response.outputs[0]
fp16_output = output_0.as_numpy()

# Read the non-binary data from the response
output_1 = infer_response.outputs[1]
fp32_output = output_1.data  # This returns the data as a list
fp32_output_arr = output_1.as_numpy()
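For clients that do not use the KServe SDK, a response carrying binary outputs can also be decoded by hand: split the body at Inference-Header-Content-Length, then consume the binary section in output order. A minimal sketch, with a partial datatype table of our own:

import json

import numpy as np

# Partial mapping from protocol datatypes to little-endian NumPy dtypes.
DTYPES = {"FP16": "<f2", "FP32": "<f4", "UINT32": "<u4", "BOOL": "?"}

def decode_response_body(body: bytes, json_length: int) -> dict:
    # json_length is the value of the Inference-Header-Content-Length header.
    response = json.loads(body[:json_length])
    binary = body[json_length:]
    offset = 0
    tensors = {}
    for output in response["outputs"]:
        size = output.get("parameters", {}).get("binary_data_size")
        if size is None:
            # Non-binary output: values are inline in the JSON "data" field.
            tensors[output["name"]] = np.array(output["data"])
        else:
            # Binary output: consume the next `size` bytes in declared order.
            raw = binary[offset:offset + size]
            offset += size
            tensors[output["name"]] = np.frombuffer(
                raw, dtype=DTYPES[output["datatype"]]
            ).reshape(output["shape"])
    return tensors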
Requesting All The Outputs To Be In Binary Format¶
For the following request, binary_data_output is set to true to receive all the outputs as binary data. Note that binary_data_output is set in the $inference_request parameters field, not in the $request_input parameters field. This parameter can be overridden for a specific output by setting the binary_data parameter to false in that $request_output, as shown in the sketch at the end of this section.
POST /v2/models/my_model/infer HTTP/1.1
Host: localhost:8000
Content-Type: application/json
Content-Length: <xx> # Json length
{
  "model_name": "my_model",
  "inputs": [
    {
      "name": "input_tensor",
      "datatype": "FP32",
      "shape": [1, 2],
      "data": [[32.045, 399.043]]
    }
  ],
  "parameters": {
    "binary_data_output": true
  }
}
HTTP/1.1 200 OK
Content-Type: application/octet-stream
Inference-Header-Content-Length: <yy> # Json length
Content-Length: <yy+28> # Json length + binary data length (In this case 12 + 16 = 28)
{
  "outputs" : [
    {
      "name" : "output_tensor0",
      "shape" : [ 3, 2 ],
      "datatype" : "FP16",
      "parameters" : {
        "binary_data_size" : 12
      }
    },
    {
      "name" : "output_tensor1",
      "shape" : [ 2, 2 ],
      "datatype" : "FP32",
      "parameters" : {
        "binary_data_size" : 16
      }
    }
  ]
}
<12 bytes of data for output_tensor0 tensor>
<16 bytes of data for output_tensor1 tensor>
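The same request can be issued with the Python client; as before, the snippet must run inside an async function.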
import numpy as np

from kserve import InferenceRESTClient, InferRequest, InferInput
from kserve.inference_client import RESTConfig

fp32_data = np.array([[32.045, 399.043]], dtype=np.float32)

# Create the input tensor, sent as non-binary data
input_0 = InferInput(name="input_tensor", datatype="FP32", shape=[1, 2])
input_0.set_data_from_numpy(fp32_data, binary_data=False)

# Create the inference request with binary_data_output set to True
infer_request = InferRequest(
    model_name="my_model",
    request_id="2ja0ls9j1309",
    infer_inputs=[input_0],
    parameters={"binary_data_output": True}
)

# Create the REST client
config = RESTConfig(verbose=True, protocol="v2")
rest_client = InferenceRESTClient(config=config)

# Send the request
infer_response = await rest_client.infer(
    "http://localhost:8000",
    model_name="my_model",
    data=infer_request,
    headers={"Host": "test-server.com"},
    timeout=2,
)

# Read the binary data from the response; both outputs are binary
output_0 = infer_response.outputs[0]
fp16_output = output_0.as_numpy()
output_1 = infer_response.outputs[1]
fp32_output_arr = output_1.as_numpy()
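To override binary_data_output for a single output, list the outputs explicitly and set binary_data to false on the one that should come back as JSON. A minimal sketch reusing input_0 from the example above; the default behavior of RequestedOutput when binary_data is not given is an assumption here:

from kserve.protocol.infer_type import RequestedOutput

infer_request = InferRequest(
    model_name="my_model",
    infer_inputs=[input_0],
    # binary_data_output requests binary encoding for every output...
    parameters={"binary_data_output": True},
    requested_outputs=[
        # ...output_tensor0 stays binary via binary_data_output (assumed default)...
        RequestedOutput(name="output_tensor0"),
        # ...while output_tensor1 overrides it and is returned as JSON.
        RequestedOutput(name="output_tensor1", binary_data=False),
    ],
)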