Add tool for generating a config file from vgpuConfig.xml

vgpuConfig.xml ships with the vGPU Manager and contains a comprehensive listing of all physical GPUs and their supported vGPU types. The tool added in this commit parses vgpuConfig.xml and creates a corresponding configuration file (yaml) for the vGPU Device Manager. Consequently, this yaml file will contain a config entry for every vGPU type supported by NVIDIA vGPU. Currently, only Q- and C-series vGPU types are added to the yaml file. Signed-off-by: Christopher Desiniotis <[email protected]>
NVIDIA · Jun 28, 2024 · 2ab0b0e · 2ab0b0e
1 parent ab20f40
commit 2ab0b0e
Show file tree

Hide file tree

Showing 5 changed files with 414 additions and 0 deletions.
diff --git a/pkg/types/vgpu_type.go b/pkg/types/vgpu_type.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"regexp"
 	"strconv"
+	"strings"
 )
 
 const (
@@ -117,3 +118,15 @@ func parseRegex(re, s string) map[string]string {
 
 	return captureGroups
 }
+
+func (v VGPUType) String() string {
+	if v.G == 0 {
+		return fmt.Sprintf("%s-%d%c", v.GPU, v.GB, v.S)
+	}
+
+	var suffix string
+	if len(v.Attr) > 0 {
+		suffix = strings.Join(v.Attr, "")
+	}
+	return fmt.Sprintf("%s-%d-%d%c%s", v.GPU, v.G, v.GB, v.S, suffix)
+}
diff --git a/tools/vgpu-config/README.md b/tools/vgpu-config/README.md
@@ -0,0 +1,20 @@
+# vgpu-config
+
+A tool to generate a `vGPU Device Manager` configuration file from `vgpuConfig.xml`, which ships with the `NVIDIA vGPU Manager`. On a live system, `vgpuConfig.xml` gets installed at `/usr/share/nvidia/vgpu/vgpuConfig.xml` and contains a comprehensive list of all vGPU types supported by NVIDIA vGPU.
+
+## Usage
+
+Generate a `config.yaml` file from `vgpuConfig.xml`:
+
+```
+vgpu-config generate -f vgpuConfig.xml -o config.yaml
+```
+
+For additional help:
+
+```
+vgpu-config -h
+```
+
+
+
diff --git a/tools/vgpu-config/generate.go b/tools/vgpu-config/generate.go
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"encoding/xml"
+	"fmt"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+
+	"gopkg.in/yaml.v3"
+
+	cli "github.com/urfave/cli/v2"
+
+	v1 "gitlab.com/nvidia/cloud-native/vgpu-device-manager/api/spec/v1"
+	"gitlab.com/nvidia/cloud-native/vgpu-device-manager/pkg/types"
+)
+
+const (
+	// TODO: make the defaults configurable
+	defaultVGPUConfigName    = "default"             // key under which the per-PGPU default entries are stored in the generated spec
+	framebufferPolicyMax     = "max"                 // pick the vGPU type with the largest framebuffer
+	framebufferPolicyHalf    = "half"                // pick the vGPU type whose framebuffer is half of the largest
+	framebufferPolicyMin     = "min"                 // pick the vGPU type with the smallest framebuffer
+	defaultFramebufferPolicy = framebufferPolicyHalf // policy passed to getDefaultVGPUType by Generate
+)
+
+// Generate converts 'vgpuConfig.xml' into a configuration file (yaml) for the vGPU Device Manager
+// Generate converts 'vgpuConfig.xml' into a configuration file (yaml) for the vGPU Device Manager.
+//
+// The xml file named by f.xmlFile is parsed and one named config entry is created for every
+// supported vGPU type whose class is not 'NVS'. In addition, a "default" config is built with
+// one entry per physical GPU that supports at least one non MIG-backed Q- or C-series type.
+// The resulting spec is marshalled to yaml and written to f.outputFile with mode 0600.
+//
+// The cli context c is accepted for symmetry with the CLI action signature but is not used.
+// Any parsing, selection, marshalling or write failure is returned as a wrapped error.
+func Generate(c *cli.Context, f *flags) error {
+	xmlFile, err := parseXMLFile(f)
+	if err != nil {
+		return fmt.Errorf("error parsing xml file: %w", err)
+	}
+
+	// Mapping between vGPU type id and vGPU type information in the xml file
+	idToType := map[int]VGPUType{}
+	for _, v := range xmlFile.VGPUTypes {
+		idToType[v.ID] = v
+	}
+
+	// Initialize the vGPU Device Manager configuration spec
+	spec := v1.Spec{
+		Version:     "v1",
+		VGPUConfigs: map[string]v1.VGPUConfigSpecSlice{},
+	}
+
+	// The default configuration will contain one entry per physical GPU supported
+	defaultConfig := v1.VGPUConfigSpecSlice{}
+
+	for _, p := range xmlFile.PGPUs {
+		// Mapping VGPU series to the list of supported VGPU types for the PGPU.
+		// Will use this later when picking a default vGPU type for the PGPU.
+		supportedVGPUs := map[types.Series][]*types.VGPUType{}
+		for _, v := range p.SupportedVGPUs {
+			// Look the type up once. An id that is not declared in the xml file is reported
+			// explicitly instead of surfacing later as an error about an empty type name.
+			typeInfo, ok := idToType[v.ID]
+			if !ok {
+				return fmt.Errorf("unknown vGPU type id %d in list of supported vGPUs", v.ID)
+			}
+
+			// Only process vGPU types of class 'Quadro' or 'Compute'; 'NVS' types are skipped.
+			// This restriction may be relaxed in the future.
+			if typeInfo.Class == "NVS" {
+				continue
+			}
+
+			// Strip the leading NVIDIA|GRID from the type name
+			typeName, err := stripVGPUTypeName(typeInfo.Name)
+			if err != nil {
+				return fmt.Errorf("error splitting vGPU type name: %w", err)
+			}
+
+			vgpuType, err := types.ParseVGPUType(typeName)
+			if err != nil {
+				return fmt.Errorf("could not parse vGPU type '%s': %w", typeInfo.Name, err)
+			}
+
+			// Add entry for this vGPU Type in the config
+			vgpuTypeStr := vgpuType.String()
+			spec.VGPUConfigs[vgpuTypeStr] = v1.VGPUConfigSpecSlice{
+				v1.VGPUConfigSpec{
+					Devices: "all",
+					VGPUDevices: types.VGPUConfig{
+						vgpuTypeStr: v.MaxVGPUs,
+					},
+				},
+			}
+
+			// Only consider non MIG-backed types later on when picking a default type for the PGPU.
+			// Note: 'G' is the number of GPU instances
+			if vgpuType.G == 0 {
+				supportedVGPUs[vgpuType.S] = append(supportedVGPUs[vgpuType.S], vgpuType)
+			}
+		}
+
+		// The below picks a default vGPU type for the PGPU. A Q-series type is selected by default
+		// unless the PGPU does not support Q-series, then C-series is used. PGPUs supporting
+		// neither series get no default entry.
+		vgpuSlice := supportedVGPUs['Q']
+		if len(vgpuSlice) == 0 {
+			vgpuSlice = supportedVGPUs['C']
+		}
+		if len(vgpuSlice) == 0 {
+			continue
+		}
+
+		defaultVGPUType, err := getDefaultVGPUType(vgpuSlice, defaultFramebufferPolicy)
+		if err != nil {
+			return fmt.Errorf("error getting default vGPU type: %w", err)
+		}
+
+		// Reuse the instance count recorded above for this type; guard the lookup so an
+		// unexpected miss becomes an error rather than an index-out-of-range panic.
+		defaultName := defaultVGPUType.String()
+		defaultSpecs := spec.VGPUConfigs[defaultName]
+		if len(defaultSpecs) == 0 {
+			return fmt.Errorf("no config entry found for default vGPU type '%s'", defaultName)
+		}
+		numInstances := defaultSpecs[0].VGPUDevices[defaultName]
+
+		deviceFilter, err := getDeviceFilterString(p.DeviceID)
+		if err != nil {
+			return fmt.Errorf("error getting device filter: %w", err)
+		}
+
+		// Add default config entry for the PGPU
+		defaultConfig = append(defaultConfig, v1.VGPUConfigSpec{
+			DeviceFilter: deviceFilter,
+			Devices:      "all",
+			VGPUDevices: types.VGPUConfig{
+				defaultName: numInstances,
+			},
+		})
+	}
+
+	spec.VGPUConfigs[defaultVGPUConfigName] = defaultConfig
+
+	data, err := yaml.Marshal(&spec)
+	if err != nil {
+		return fmt.Errorf("error marshalling data: %w", err)
+	}
+
+	if err := os.WriteFile(f.outputFile, data, 0600); err != nil {
+		return fmt.Errorf("could not write to file: %w", err)
+	}
+	return nil
+}
+
+func parseXMLFile(f *flags) (*VGPUConfig, error) {
+	xmlFile, err := os.ReadFile(f.xmlFile)
+	if err != nil {
+		return nil, fmt.Errorf("error reading file: %v", err)
+	}
+
+	var vgpuConfig VGPUConfig
+	err = xml.Unmarshal(xmlFile, &vgpuConfig)
+	if err != nil {
+		return nil, fmt.Errorf("unmarshal error: %v", err)
+	}
+
+	return &vgpuConfig, nil
+}
+
+// stripVGPUTypeName removes the leading vendor prefix from a vGPU type name.
+//
+// The name is expected in the format "[NVIDIA|GRID] <vGPU type>"; everything after the
+// first space-separated word is returned. Surrounding whitespace, and any additional
+// whitespace between the prefix and the type, is discarded. An error is returned when
+// the name does not consist of at least a prefix and a type.
+func stripVGPUTypeName(s string) (string, error) {
+	// Type name in the format: [NVIDIA|GRID] <vGPU type>
+	typeSplit := strings.SplitN(strings.TrimSpace(s), " ", 2)
+	if len(typeSplit) != 2 {
+		return "", fmt.Errorf("malformed vGPU type name: %s", s)
+	}
+
+	// With repeated separator spaces SplitN leaves the extras at the front of the
+	// remainder; trim them so the caller receives a clean type string.
+	return strings.TrimSpace(typeSplit[1]), nil
+}
+
+// getVGPUDevicesSpec builds a single-entry vGPU device config for the supported vGPU v.
+//
+// The entry's key is the type name registered under v.ID in idToType with its vendor
+// prefix stripped, and its value is v.MaxVGPUs. An error is returned when the type
+// name cannot be stripped.
+func getVGPUDevicesSpec(v SupportedVGPU, idToType *map[int]VGPUType) (types.VGPUConfig, error) {
+	info := (*idToType)[v.ID]
+
+	name, err := stripVGPUTypeName(info.Name)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing vGPU type name for id %d: %v", v.ID, err)
+	}
+
+	return types.VGPUConfig{name: v.MaxVGPUs}, nil
+}
+
+// getDefaultVGPUType selects one vGPU type from vgpuTypes according to policy.
+//
+// Candidates are ranked by framebuffer size in GB, largest first:
+//   - "max" returns the type with the largest framebuffer;
+//   - "min" returns the type with the smallest framebuffer;
+//   - "half" returns the first type whose framebuffer equals half (integer division)
+//     of the largest framebuffer.
+//
+// An error is returned for an empty list, for an unknown policy, and for the "half"
+// policy when no type has exactly half the largest framebuffer. When the list holds a
+// single type it is returned without consulting the policy.
+//
+// The caller's slice is not modified: ranking is performed on a copy, and a stable
+// sort keeps the relative order of types with equal framebuffer size deterministic.
+func getDefaultVGPUType(vgpuTypes []*types.VGPUType, policy string) (*types.VGPUType, error) {
+	if len(vgpuTypes) == 0 {
+		return nil, fmt.Errorf("no vGPU types")
+	}
+	// For GH200, there is only one valid vGPU type, GH200-96C, when MIG is not enabled
+	if len(vgpuTypes) == 1 {
+		return vgpuTypes[0], nil
+	}
+
+	// Sort a copy in descending order by framebuffer size in GB
+	ranked := append([]*types.VGPUType(nil), vgpuTypes...)
+	sort.SliceStable(ranked, func(i, j int) bool {
+		return ranked[i].GB > ranked[j].GB
+	})
+
+	switch policy {
+	case framebufferPolicyMax:
+		return ranked[0], nil
+	case framebufferPolicyMin:
+		return ranked[len(ranked)-1], nil
+	case framebufferPolicyHalf:
+		halfGB := ranked[0].GB / 2
+		for _, v := range ranked {
+			if v.GB == halfGB {
+				return v, nil
+			}
+		}
+		return nil, fmt.Errorf("error finding a vGPU type with half the max framebuffer size")
+	default:
+		return nil, fmt.Errorf("invalid policy '%s' for selecting default vGPU type", policy)
+	}
+}
+
+// getDeviceFilterString converts the device and vendor id strings held in deviceInfo into
+// the string form of a types.DeviceID.
+//
+// Both ids are parsed as 16-bit unsigned integers with base 0, i.e. the base is taken
+// from the string's prefix. The device id is parsed first; the first parse failure is
+// returned as an error.
+func getDeviceFilterString(deviceInfo DeviceID) (string, error) {
+	dev, devErr := strconv.ParseUint(deviceInfo.DeviceID, 0, 16)
+	if devErr != nil {
+		return "", fmt.Errorf("unable to convert device id string to uint16: %v", devErr)
+	}
+
+	vendor, vendorErr := strconv.ParseUint(deviceInfo.VendorID, 0, 16)
+	if vendorErr != nil {
+		return "", fmt.Errorf("unable to convert vendor id string to uint16: %v", vendorErr)
+	}
+
+	return types.NewDeviceID(uint16(dev), uint16(vendor)).String(), nil
+}
diff --git a/tools/vgpu-config/main.go b/tools/vgpu-config/main.go
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"fmt"
+	"os"
+
+	log "github.com/sirupsen/logrus"
+	cli "github.com/urfave/cli/v2"
+)
+
+type flags struct { // command-line options of the 'generate' subcommand
+	xmlFile    string // path to the input xml file; destination of --xml-file / -f / XML_FILE
+	outputFile string // path to the output file; destination of --output-file / -o / OUTPUT_FILE
+}
+
+// main builds the "vgpu-config" CLI application with its single "generate" subcommand
+// and runs it against os.Args. A non-nil error from the run is reported via log.Fatal.
+func main() {
+	opts := flags{}
+
+	app := cli.NewApp()
+	app.Name = "vgpu-config"
+	app.Usage = "Manage configuration files for NVIDIA vGPU Device Manager"
+	app.Version = "0.1.0"
+
+	// The generate subcommand validates its options before invoking Generate.
+	generateCmd := &cli.Command{
+		Name:  "generate",
+		Usage: "Generate a vGPU device configuration file from an xml file (vgpuConfig.xml)",
+		Before: func(_ *cli.Context) error {
+			return validateFlags(&opts)
+		},
+		Action: func(ctx *cli.Context) error {
+			return Generate(ctx, &opts)
+		},
+		Flags: []cli.Flag{
+			&cli.StringFlag{
+				Name:        "xml-file",
+				Aliases:     []string{"f"},
+				Usage:       "Path to the xml file",
+				Required:    true,
+				Destination: &opts.xmlFile,
+				EnvVars:     []string{"XML_FILE"},
+			},
+			&cli.StringFlag{
+				Name:        "output-file",
+				Aliases:     []string{"o"},
+				Required:    true,
+				Usage:       "Path to the output file",
+				Destination: &opts.outputFile,
+				EnvVars:     []string{"OUTPUT_FILE"},
+			},
+		},
+	}
+
+	// Register the subcommand with the top-level CLI
+	app.Commands = []*cli.Command{generateCmd}
+
+	if err := app.Run(os.Args); err != nil {
+		log.Fatal(fmt.Errorf("error: %v", err))
+	}
+}
+
+// validateFlags reports an error when either the xml input path or the output path
+// in f is empty. The xml path is checked first; nil is returned when both are set.
+func validateFlags(f *flags) error {
+	switch {
+	case f.xmlFile == "":
+		return fmt.Errorf("invalid --xml-file option: %v", f.xmlFile)
+	case f.outputFile == "":
+		return fmt.Errorf("invalid --output-file option: %v", f.outputFile)
+	}
+
+	return nil
+}