Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Add tool for generating a config file from vgpuConfig.xml #16

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions pkg/types/vgpu_type.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"regexp"
"strconv"
"strings"
)

const (
Expand Down Expand Up @@ -117,3 +118,15 @@ func parseRegex(re, s string) map[string]string {

return captureGroups
}

// String formats the vGPU type as a name string.
//
// When v.G is zero the result is "<GPU>-<GB><S>". Otherwise the result is
// "<GPU>-<G>-<GB><S>" followed by every entry of v.Attr concatenated with no
// separator.
func (v VGPUType) String() string {
	if v.G != 0 {
		// strings.Join returns "" for an empty slice, so no length guard is needed.
		return fmt.Sprintf("%s-%d-%d%c%s", v.GPU, v.G, v.GB, v.S, strings.Join(v.Attr, ""))
	}

	return fmt.Sprintf("%s-%d%c", v.GPU, v.GB, v.S)
}
20 changes: 20 additions & 0 deletions tools/vgpu-config/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# vgpu-config

A tool that generates a `vGPU Device Manager` configuration file from `vgpuConfig.xml`, which ships with the `NVIDIA vGPU Manager`. On a live system, `vgpuConfig.xml` is installed at `/usr/share/nvidia/vgpu/vgpuConfig.xml` and contains a comprehensive list of all vGPU types supported by NVIDIA vGPU.

## Usage

Generate a `config.yaml` file from `vgpuConfig.xml`:

```
vgpu-config generate -f vgpuConfig.xml -o config.yaml
```

For additional help:

```
vgpu-config -h
```



241 changes: 241 additions & 0 deletions tools/vgpu-config/generate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"encoding/xml"
"fmt"
"os"
"sort"
"strconv"
"strings"

"gopkg.in/yaml.v3"

cli "github.com/urfave/cli/v2"

v1 "gitlab.com/nvidia/cloud-native/vgpu-device-manager/api/spec/v1"

Check failure on line 31 in tools/vgpu-config/generate.go

View workflow job for this annotation

GitHub Actions / build

cannot find module providing package gitlab.com/nvidia/cloud-native/vgpu-device-manager/api/spec/v1: import lookup disabled by -mod=vendor

Check failure on line 31 in tools/vgpu-config/generate.go

View workflow job for this annotation

GitHub Actions / Unit test

cannot find module providing package gitlab.com/nvidia/cloud-native/vgpu-device-manager/api/spec/v1: import lookup disabled by -mod=vendor
"gitlab.com/nvidia/cloud-native/vgpu-device-manager/pkg/types"

Check failure on line 32 in tools/vgpu-config/generate.go

View workflow job for this annotation

GitHub Actions / build

cannot find module providing package gitlab.com/nvidia/cloud-native/vgpu-device-manager/pkg/types: import lookup disabled by -mod=vendor

Check failure on line 32 in tools/vgpu-config/generate.go

View workflow job for this annotation

GitHub Actions / Unit test

cannot find module providing package gitlab.com/nvidia/cloud-native/vgpu-device-manager/pkg/types: import lookup disabled by -mod=vendor
)

const (
	// TODO: make the defaults configurable

	// defaultVGPUConfigName is the key under which Generate stores the
	// per-physical-GPU default configuration in the generated spec.
	defaultVGPUConfigName = "default"
	// framebufferPolicyMax selects the vGPU type with the largest framebuffer.
	framebufferPolicyMax = "max"
	// framebufferPolicyHalf selects the vGPU type whose framebuffer is exactly
	// half (integer division) of the largest one.
	framebufferPolicyHalf = "half"
	// framebufferPolicyMin selects the vGPU type with the smallest framebuffer.
	framebufferPolicyMin = "min"
	// defaultFramebufferPolicy is the policy Generate passes to
	// getDefaultVGPUType when picking a default vGPU type.
	defaultFramebufferPolicy = framebufferPolicyHalf
)

// Generate converts 'vgpuConfig.xml' into a configuration file (yaml) for the vGPU Device Manager.
//
// The xml file named by f.xmlFile is parsed, and the resulting spec is written
// to f.outputFile with mode 0600. The spec contains:
//   - one named config per vGPU type string, applying that type to all devices
//     with the MaxVGPUs count taken from the xml;
//   - a "default" config holding one device-filtered entry per physical GPU
//     that supports at least one non MIG-backed Q- or C-series type.
//
// The cli context c is currently unused. An error is returned if the xml
// cannot be read or parsed, a supported vGPU references an unknown type id, a
// type name cannot be parsed, no default type can be selected, or the output
// cannot be marshalled or written.
func Generate(c *cli.Context, f *flags) error {
	xmlFile, err := parseXMLFile(f)
	if err != nil {
		return fmt.Errorf("error parsing xml file: %w", err)
	}

	// Mapping between vGPU type id and vGPU type information in the xml file
	idToType := map[int]VGPUType{}
	for _, v := range xmlFile.VGPUTypes {
		idToType[v.ID] = v
	}

	// Initialize the vGPU Device Manager configuration spec
	spec := v1.Spec{
		Version:     "v1",
		VGPUConfigs: map[string]v1.VGPUConfigSpecSlice{},
	}

	// The default configuration will contain one entry per physical GPU supported
	defaultConfig := v1.VGPUConfigSpecSlice{}

	for _, p := range xmlFile.PGPUs {
		// Mapping VGPU series to the list of supported VGPU types for the PGPU.
		// Will use this later when picking a default vGPU type for the PGPU.
		supportedVGPUs := map[types.Series][]*types.VGPUType{}
		for _, v := range p.SupportedVGPUs {
			// Fail with a precise message when the PGPU references a type id
			// that has no definition, instead of operating on a zero value.
			typeInfo, ok := idToType[v.ID]
			if !ok {
				return fmt.Errorf("supported vGPU references unknown vGPU type id %v", v.ID)
			}

			// vGPU types of class 'NVS' are skipped; every other class is processed.
			// This restriction may be relaxed in the future.
			if typeInfo.Class == "NVS" {
				continue
			}

			// Strip the leading NVIDIA|GRID from the type name
			typeName, err := stripVGPUTypeName(typeInfo.Name)
			if err != nil {
				return fmt.Errorf("error splitting vGPU type name: %w", err)
			}

			vgpuType, err := types.ParseVGPUType(typeName)
			if err != nil {
				return fmt.Errorf("could not parse vGPU type '%s': %w", typeInfo.Name, err)
			}

			// Add entry for this vGPU Type in the config
			vgpuTypeStr := vgpuType.String()
			spec.VGPUConfigs[vgpuTypeStr] = v1.VGPUConfigSpecSlice{
				v1.VGPUConfigSpec{
					Devices: "all",
					VGPUDevices: types.VGPUConfig{
						vgpuTypeStr: v.MaxVGPUs,
					},
				},
			}

			// Only consider non MIG-backed types later on when picking a default type for the PGPU.
			// Note: 'G' is the number of GPU instances
			if vgpuType.G == 0 {
				supportedVGPUs[vgpuType.S] = append(supportedVGPUs[vgpuType.S], vgpuType)
			}
		}

		// The below picks a default vGPU type for the PGPU. A Q-series type is selected by default
		// unless the PGPU does not support Q-series, then C-series is used. PGPUs supporting
		// neither series get no entry in the default config.
		vgpuSlice := supportedVGPUs['Q']
		if len(vgpuSlice) == 0 {
			vgpuSlice = supportedVGPUs['C']
		}
		if len(vgpuSlice) == 0 {
			continue
		}

		defaultVGPUType, err := getDefaultVGPUType(vgpuSlice, defaultFramebufferPolicy)
		if err != nil {
			return fmt.Errorf("error getting default vGPU type: %w", err)
		}

		// The per-type config for the default type was written by the loop above;
		// guard the index anyway so an inconsistency cannot panic.
		defaultName := defaultVGPUType.String()
		entries := spec.VGPUConfigs[defaultName]
		if len(entries) == 0 {
			return fmt.Errorf("no config entry found for default vGPU type '%s'", defaultName)
		}
		numInstances := entries[0].VGPUDevices[defaultName]

		deviceFilter, err := getDeviceFilterString(p.DeviceID)
		if err != nil {
			return fmt.Errorf("error getting device filter: %w", err)
		}

		// Add default config entry for the PGPU
		defaultConfig = append(defaultConfig, v1.VGPUConfigSpec{
			DeviceFilter: deviceFilter,
			Devices:      "all",
			VGPUDevices: types.VGPUConfig{
				defaultName: numInstances,
			},
		})
	}

	spec.VGPUConfigs[defaultVGPUConfigName] = defaultConfig

	data, err := yaml.Marshal(&spec)
	if err != nil {
		return fmt.Errorf("error marshalling data: %w", err)
	}

	if err := os.WriteFile(f.outputFile, data, 0600); err != nil {
		return fmt.Errorf("could not write to file: %w", err)
	}
	return nil
}

// parseXMLFile reads the file at f.xmlFile and decodes its xml content into a
// VGPUConfig. It returns an error if the file cannot be read or if the content
// cannot be unmarshalled.
func parseXMLFile(f *flags) (*VGPUConfig, error) {
	raw, readErr := os.ReadFile(f.xmlFile)
	if readErr != nil {
		return nil, fmt.Errorf("error reading file: %v", readErr)
	}

	parsed := new(VGPUConfig)
	if decodeErr := xml.Unmarshal(raw, parsed); decodeErr != nil {
		return nil, fmt.Errorf("unmarshal error: %v", decodeErr)
	}

	return parsed, nil
}

// stripVGPUTypeName drops the first space-delimited word from a vGPU type name
// of the form "[NVIDIA|GRID] <vGPU type>" and returns the remainder.
//
// Surrounding whitespace is trimmed first. An error is returned when the
// trimmed name contains no space, i.e. there is nothing after the prefix.
func stripVGPUTypeName(s string) (string, error) {
	trimmed := strings.TrimSpace(s)

	// Everything after the first space is the vGPU type.
	sep := strings.IndexByte(trimmed, ' ')
	if sep < 0 {
		return "", fmt.Errorf("malformed vGPU type name: %s", s)
	}

	return trimmed[sep+1:], nil
}

func getVGPUDevicesSpec(v SupportedVGPU, idToType *map[int]VGPUType) (types.VGPUConfig, error) {
vgpuType := (*idToType)[v.ID]

typeName, err := stripVGPUTypeName(vgpuType.Name)

Check failure on line 184 in tools/vgpu-config/generate.go

View workflow job for this annotation

GitHub Actions / check

typeName declared and not used (typecheck)
if err != nil {
return nil, fmt.Errorf("error parsing vGPU type name for id %d: %v", v.ID, err)
}

vgpuDevices := types.VGPUConfig{
typeName: v.MaxVGPUs,
}

return vgpuDevices, nil
}

// getDefaultVGPUType picks one vGPU type out of vgpuTypes according to policy,
// comparing candidates by framebuffer size in GB.
//
// Policies:
//   - framebufferPolicyMax:  the type with the largest framebuffer
//   - framebufferPolicyMin:  the type with the smallest framebuffer
//   - framebufferPolicyHalf: the type whose framebuffer equals half (integer
//     division) of the largest one; an error if no such type exists
//
// A single-element slice short-circuits and returns that element regardless of
// policy. An error is returned for an empty slice or an unrecognized policy.
// The caller's slice is not reordered.
func getDefaultVGPUType(vgpuTypes []*types.VGPUType, policy string) (*types.VGPUType, error) {
	if len(vgpuTypes) == 0 {
		return nil, fmt.Errorf("no vGPU types")
	}
	// For GH200, there is only one valid vGPU type, GH200-96C, when MIG is not enabled
	if len(vgpuTypes) == 1 {
		return vgpuTypes[0], nil
	}

	// Sort a copy in descending order by framebuffer size in GB so the input
	// slice is left untouched. The stable sort keeps types with equal
	// framebuffer sizes in their input order, making the choice deterministic.
	byFramebuffer := make([]*types.VGPUType, len(vgpuTypes))
	copy(byFramebuffer, vgpuTypes)
	sort.SliceStable(byFramebuffer, func(i, j int) bool {
		return byFramebuffer[i].GB > byFramebuffer[j].GB
	})

	switch policy {
	case framebufferPolicyMax:
		return byFramebuffer[0], nil
	case framebufferPolicyMin:
		return byFramebuffer[len(byFramebuffer)-1], nil
	case framebufferPolicyHalf:
		halfGB := byFramebuffer[0].GB / 2
		for _, candidate := range byFramebuffer {
			if candidate.GB == halfGB {
				return candidate, nil
			}
		}
		return nil, fmt.Errorf("error finding a vGPU type with half the max framebuffer size")
	default:
		return nil, fmt.Errorf("invalid policy '%s' for selecting default vGPU type", policy)
	}
}

// getDeviceFilterString converts the textual device and vendor ids in
// deviceInfo into the string form of a types.DeviceID filter.
//
// Both ids are parsed with base auto-detection (base 0) and must fit in 16
// bits; an error is returned if either fails to parse.
func getDeviceFilterString(deviceInfo DeviceID) (string, error) {
	var (
		device, vendor uint64
		err            error
	)

	if device, err = strconv.ParseUint(deviceInfo.DeviceID, 0, 16); err != nil {
		return "", fmt.Errorf("unable to convert device id string to uint16: %v", err)
	}
	if vendor, err = strconv.ParseUint(deviceInfo.VendorID, 0, 16); err != nil {
		return "", fmt.Errorf("unable to convert vendor id string to uint16: %v", err)
	}

	filter := types.NewDeviceID(uint16(device), uint16(vendor))
	return filter.String(), nil
}
88 changes: 88 additions & 0 deletions tools/vgpu-config/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright (c) NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"fmt"
"os"

log "github.com/sirupsen/logrus"
cli "github.com/urfave/cli/v2"
)

// flags holds the command-line options of the 'generate' subcommand.
type flags struct {
	// xmlFile is the path to the input xml file (--xml-file / -f).
	xmlFile string
	// outputFile is the path the generated config is written to (--output-file / -o).
	outputFile string
}

// main wires up the vgpu-config CLI with its single 'generate' subcommand and
// runs it against the process arguments, logging a fatal error on failure.
func main() {
	opts := flags{}

	// The 'generate' subcommand: flags are validated in Before, then Generate runs.
	generateCmd := &cli.Command{
		Name:  "generate",
		Usage: "Generate a vGPU device configuration file from an xml file (vgpuConfig.xml)",
		Before: func(c *cli.Context) error {
			return validateFlags(&opts)
		},
		Action: func(c *cli.Context) error {
			return Generate(c, &opts)
		},
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:        "xml-file",
				Aliases:     []string{"f"},
				Usage:       "Path to the xml file",
				Required:    true,
				Destination: &opts.xmlFile,
				EnvVars:     []string{"XML_FILE"},
			},
			&cli.StringFlag{
				Name:        "output-file",
				Aliases:     []string{"o"},
				Required:    true,
				Usage:       "Path to the output file",
				Destination: &opts.outputFile,
				EnvVars:     []string{"OUTPUT_FILE"},
			},
		},
	}

	app := cli.NewApp()
	app.Name = "vgpu-config"
	app.Usage = "Manage configuration files for NVIDIA vGPU Device Manager"
	app.Version = "0.1.0"
	// Register the subcommand with the top-level CLI
	app.Commands = []*cli.Command{generateCmd}

	if err := app.Run(os.Args); err != nil {
		log.Fatal(fmt.Errorf("error: %v", err))
	}
}

// validateFlags checks that both the xml input path and the output path were
// provided. It returns an error naming the first option that is empty, or nil
// when both are set.
func validateFlags(f *flags) error {
	switch {
	case f.xmlFile == "":
		return fmt.Errorf("invalid --xml-file option: %v", f.xmlFile)
	case f.outputFile == "":
		return fmt.Errorf("invalid --output-file option: %v", f.outputFile)
	default:
		return nil
	}
}
Loading
Loading