-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add tool for generating a config file from vgpuConfig.xml
vgpuConfig.xml ships with the vGPU Manager and contains a comprehensive listing of all physical GPUs and their supported vGPU types. The tool added in this commit parses vgpuConfig.xml and creates a corresponding configuration file (yaml) for the vGPU Device Manager. Consequently, this yaml file will contain a config entry for every vGPU type supported by NVIDIA vGPU. Currently, only Q- and C-series vGPU types are added to the yaml file. Signed-off-by: Christopher Desiniotis <[email protected]>
- Loading branch information
1 parent
ab20f40
commit 2ab0b0e
Showing
5 changed files
with
414 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# vgpu-config | ||
|
||
A tool to generate a `vGPU Device Manager` configuration file from `vgpuConfig.xml` which ships with the `NVIDIA vGPU Manager`. On a live system, `vgpuConfig.xml` gets installed at `/usr/share/nvidia/vgpu/vgpuConfig.xml` and contains a comprehensive list of all vGPU types supported by NVIDIA vGPU. | ||
|
||
## Usage | ||
|
||
Generate a `config.yaml` file from `vgpuConfig.xml`: | ||
|
||
``` | ||
vgpu-config generate -f vgpuConfig.xml -o config.yaml | ||
``` | ||
|
||
For additional help: | ||
|
||
``` | ||
vgpu-config -h | ||
``` | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"encoding/xml" | ||
"fmt" | ||
"os" | ||
"sort" | ||
"strconv" | ||
"strings" | ||
|
||
"gopkg.in/yaml.v3" | ||
|
||
cli "github.com/urfave/cli/v2" | ||
|
||
v1 "gitlab.com/nvidia/cloud-native/vgpu-device-manager/api/spec/v1" | ||
Check failure on line 31 in tools/vgpu-config/generate.go GitHub Actions / build
|
||
"gitlab.com/nvidia/cloud-native/vgpu-device-manager/pkg/types" | ||
Check failure on line 32 in tools/vgpu-config/generate.go GitHub Actions / build
|
||
) | ||
|
||
const ( | ||
// TODO: make the default's configurable | ||
defaultVGPUConfigName = "default" | ||
framebufferPolicyMax = "max" | ||
framebufferPolicyHalf = "half" | ||
framebufferPolicyMin = "min" | ||
defaultFramebufferPolicy = framebufferPolicyHalf | ||
) | ||
|
||
// Generate converts 'vgpuConfig.xml' into a configuration file (yaml) for the vGPU Device Manager | ||
func Generate(c *cli.Context, f *flags) error { | ||
xmlFile, err := parseXMLFile(f) | ||
if err != nil { | ||
return fmt.Errorf("error parsing xml file: %v", err) | ||
} | ||
|
||
// Mapping between vGPU type id and vGPU type information in the xml file | ||
idToType := map[int]VGPUType{} | ||
for _, v := range xmlFile.VGPUTypes { | ||
idToType[v.ID] = v | ||
} | ||
|
||
// Initialize the vGPU Device Manager configuration spec | ||
spec := v1.Spec{ | ||
Version: "v1", | ||
VGPUConfigs: map[string]v1.VGPUConfigSpecSlice{}, | ||
} | ||
|
||
// The default configuration will contain one entry per physical GPU supported | ||
defaultConfig := v1.VGPUConfigSpecSlice{} | ||
|
||
for _, p := range xmlFile.PGPUs { | ||
// Mapping VGPU series to the list of supported VGPU types for the PGPU. | ||
// Will use this later when picking a default vGPU type for the PGPU. | ||
supportedVGPUs := map[types.Series][]*types.VGPUType{} | ||
for _, v := range p.SupportedVGPUs { | ||
// Only process vGPU types of class 'Quadro' or 'Compute'. | ||
// This restriction may be relaxed in the future. | ||
class := idToType[v.ID].Class | ||
if class == "NVS" { | ||
continue | ||
} | ||
|
||
_type := idToType[v.ID] | ||
// Strip the leading NVIDIA|GRID from the type name | ||
split := strings.SplitN(_type.Name, " ", 2) | ||
if len(split) != 2 { | ||
return fmt.Errorf("error splitting vGPU type name: %s", _type.Name) | ||
} | ||
|
||
vgpuType, err := types.ParseVGPUType(split[1]) | ||
if err != nil { | ||
return fmt.Errorf("could not parse vGPU type '%s': %v", _type.Name, err) | ||
} | ||
|
||
// Add entry for this vGPU Type in the config | ||
vgpuTypeStr := vgpuType.String() | ||
spec.VGPUConfigs[vgpuTypeStr] = v1.VGPUConfigSpecSlice{ | ||
v1.VGPUConfigSpec{ | ||
Devices: "all", | ||
VGPUDevices: types.VGPUConfig{ | ||
vgpuTypeStr: v.MaxVGPUs, | ||
}, | ||
}, | ||
} | ||
|
||
// Only consider non MIG-backed types later on when picking a default type for the PGPU. | ||
// Note: 'G' is the number of GPU instances | ||
if vgpuType.G == 0 { | ||
supportedVGPUs[vgpuType.S] = append(supportedVGPUs[vgpuType.S], vgpuType) | ||
} | ||
} | ||
|
||
// The below picks a default vGPU type for the PGPU. A Q-series type is selected by default | ||
// unless the PGPU does not support Q-series, then C-series is used. | ||
vgpuSlice := supportedVGPUs['Q'] | ||
if len(supportedVGPUs['Q']) == 0 && len(supportedVGPUs['C']) == 0 { | ||
continue | ||
} | ||
if len(supportedVGPUs['Q']) == 0 { | ||
vgpuSlice = supportedVGPUs['C'] | ||
} | ||
|
||
defaultVGPUType, err := getDefaultVGPUType(vgpuSlice, defaultFramebufferPolicy) | ||
if err != nil { | ||
return fmt.Errorf("error getting default vGPU type: %v", err) | ||
} | ||
|
||
defaultName := defaultVGPUType.String() | ||
numInstances := spec.VGPUConfigs[defaultName][0].VGPUDevices[defaultName] | ||
|
||
deviceFilter, err := getDeviceFilterString(p.DeviceID) | ||
if err != nil { | ||
return fmt.Errorf("error getting device filter: %v", err) | ||
} | ||
|
||
// Add default config entry for the PGPU | ||
defaultConfig = append(defaultConfig, v1.VGPUConfigSpec{ | ||
DeviceFilter: deviceFilter, | ||
Devices: "all", | ||
VGPUDevices: types.VGPUConfig{ | ||
defaultName: numInstances, | ||
}, | ||
}) | ||
} | ||
|
||
spec.VGPUConfigs[defaultVGPUConfigName] = defaultConfig | ||
|
||
data, err := yaml.Marshal(&spec) | ||
if err != nil { | ||
return fmt.Errorf("error marshalling data: %v", err) | ||
} | ||
|
||
err = os.WriteFile(f.outputFile, data, 0600) | ||
if err != nil { | ||
return fmt.Errorf("could not write to file: %v", err) | ||
} | ||
return nil | ||
} | ||
|
||
func parseXMLFile(f *flags) (*VGPUConfig, error) { | ||
xmlFile, err := os.ReadFile(f.xmlFile) | ||
if err != nil { | ||
return nil, fmt.Errorf("error reading file: %v", err) | ||
} | ||
|
||
var vgpuConfig VGPUConfig | ||
err = xml.Unmarshal(xmlFile, &vgpuConfig) | ||
if err != nil { | ||
return nil, fmt.Errorf("unmarshal error: %v", err) | ||
} | ||
|
||
return &vgpuConfig, nil | ||
} | ||
|
||
func stripVGPUTypeName(s string) (string, error) { | ||
// Type name in the format: [NVIDIA|GRID] <vGPU type> | ||
typeStr := strings.TrimSpace(s) | ||
typeSplit := strings.SplitN(typeStr, " ", 2) | ||
if len(typeSplit) != 2 { | ||
return "", fmt.Errorf("malformed vGPU type name: %s", s) | ||
} | ||
|
||
return typeSplit[1], nil | ||
} | ||
|
||
func getVGPUDevicesSpec(v SupportedVGPU, idToType *map[int]VGPUType) (types.VGPUConfig, error) { | ||
vgpuType := (*idToType)[v.ID] | ||
|
||
typeName, err := stripVGPUTypeName(vgpuType.Name) | ||
if err != nil { | ||
return nil, fmt.Errorf("error parsing vGPU type name for id %d: %v", v.ID, err) | ||
} | ||
|
||
vgpuDevices := types.VGPUConfig{ | ||
typeName: v.MaxVGPUs, | ||
} | ||
|
||
return vgpuDevices, nil | ||
} | ||
|
||
func getDefaultVGPUType(vgpuTypes []*types.VGPUType, policy string) (*types.VGPUType, error) { | ||
// Sort in descending order by framebuffer size in GB | ||
sort.Slice(vgpuTypes, func(i, j int) bool { | ||
return vgpuTypes[i].GB > vgpuTypes[j].GB | ||
}) | ||
|
||
if len(vgpuTypes) == 0 { | ||
return nil, fmt.Errorf("no vGPU types") | ||
} | ||
// For GH200, there is only one valid vGPU type, GH200-96C, when MIG is not enabled | ||
if len(vgpuTypes) == 1 { | ||
return vgpuTypes[0], nil | ||
} | ||
|
||
halfGB := vgpuTypes[0].GB / 2 | ||
switch policy { | ||
case framebufferPolicyMax: | ||
return vgpuTypes[0], nil | ||
case framebufferPolicyMin: | ||
return vgpuTypes[len(vgpuTypes)-1], nil | ||
case framebufferPolicyHalf: | ||
for i, v := range vgpuTypes { | ||
if v.GB == halfGB { | ||
return vgpuTypes[i], nil | ||
} | ||
} | ||
return nil, fmt.Errorf("error finding a vGPU type with half the max framebuffer size") | ||
default: | ||
return nil, fmt.Errorf("invalid policy '%s' for selecting default vGPU type", policy) | ||
} | ||
} | ||
|
||
func getDeviceFilterString(deviceInfo DeviceID) (string, error) { | ||
deviceID, err := strconv.ParseUint(deviceInfo.DeviceID, 0, 16) | ||
if err != nil { | ||
return "", fmt.Errorf("unable to convert device id string to uint16: %v", err) | ||
} | ||
|
||
vendorID, err := strconv.ParseUint(deviceInfo.VendorID, 0, 16) | ||
if err != nil { | ||
return "", fmt.Errorf("unable to convert vendor id string to uint16: %v", err) | ||
} | ||
|
||
deviceFilter := types.NewDeviceID(uint16(deviceID), uint16(vendorID)) | ||
return deviceFilter.String(), nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/* | ||
* Copyright (c) NVIDIA CORPORATION. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
|
||
log "github.com/sirupsen/logrus" | ||
cli "github.com/urfave/cli/v2" | ||
) | ||
|
||
type flags struct { | ||
xmlFile string | ||
outputFile string | ||
} | ||
|
||
func main() { | ||
flags := flags{} | ||
|
||
c := cli.NewApp() | ||
c.Name = "vgpu-config" | ||
c.Usage = "Manage configuration files for NVIDIA vGPU Device Manager" | ||
c.Version = "0.1.0" | ||
|
||
generate := cli.Command{} | ||
generate.Name = "generate" | ||
generate.Usage = "Generate a vGPU device configuration file from an xml file (vgpuConfig.xml)" | ||
generate.Before = func(c *cli.Context) error { | ||
return validateFlags(&flags) | ||
} | ||
generate.Action = func(c *cli.Context) error { | ||
return Generate(c, &flags) | ||
} | ||
|
||
// Register the subcommand with the top-level CLI | ||
c.Commands = []*cli.Command{ | ||
&generate, | ||
} | ||
|
||
generate.Flags = []cli.Flag{ | ||
&cli.StringFlag{ | ||
Name: "xml-file", | ||
Aliases: []string{"f"}, | ||
Usage: "Path to the xml file", | ||
Required: true, | ||
Destination: &flags.xmlFile, | ||
EnvVars: []string{"XML_FILE"}, | ||
}, | ||
&cli.StringFlag{ | ||
Name: "output-file", | ||
Aliases: []string{"o"}, | ||
Required: true, | ||
Usage: "Path to the output file", | ||
Destination: &flags.outputFile, | ||
EnvVars: []string{"OUTPUT_FILE"}, | ||
}, | ||
} | ||
|
||
if err := c.Run(os.Args); err != nil { | ||
log.Fatal(fmt.Errorf("error: %v", err)) | ||
} | ||
} | ||
|
||
func validateFlags(f *flags) error { | ||
if f.xmlFile == "" { | ||
return fmt.Errorf("invalid --xml-file option: %v", f.xmlFile) | ||
} | ||
if f.outputFile == "" { | ||
return fmt.Errorf("invalid --output-file option: %v", f.outputFile) | ||
} | ||
|
||
return nil | ||
} |
Oops, something went wrong.