Skip to content

Latest commit

 

History

History
403 lines (389 loc) · 21.8 KB

SWOP_profile_schema.md

File metadata and controls

403 lines (389 loc) · 21.8 KB
NOTE:

The following is the source of the profile schema file, SWOP-profile-schema.yaml.

######################################################################
## (C)Copyright 2021-2023 Hewlett Packard Enterprise Development LP
######################################################################
# Root of the SWOP profile schema: a profile document must be a mapping.
"$schema": "https://json-schema.org/draft/2020-12/schema"
title: "SWOP Profile definition"
description: "Describes the structure of the profile file"
type: object
# Each key below describes one top-level entry of the profile.
properties:
    # Name of the profile group (free-form string).
    groupname:
        type: string
        description: "name of the profile group"
    # Identifier of the taskrunner contract this profile uses.
    taskrunner:
        type: string
        description: "id of the taskrunner contract to use"
    # Either null or a string; per the description, null means that no policy
    # is applied and all tasks are accepted.
    policyuri:
        description: "None (no policy accept all tasks) or uri to the policy engine"
        oneOf:
            - const: null
            - type: string
    # Resource manager backing this SWOP instance; DOCKER is the only accepted type.
    resourcemgr:
        type: object
        description: "Specification of the underlying resourcemgr.
            Currently only DOCKER is supported"
        properties:
            mgrtype:
                type: string
                description: "Identifies the resource manager type"
                enum: [DOCKER]
            accessinfo:
                # Single-alternative oneOf: the only accepted shape is an object
                # whose `accesstype` is FROMENV.
                oneOf:
                    - type: object
                      description: "Specifies how to instantiate docker client
                          FROMENV - Return a client configured from environment variables.
                          The environment variables used are the same as those
                          used by the Docker command-line client. Currently FROMENV only supported.
                          It can be extended to instantiate client that will allow you to communicate
                          with a Docker daemon through UNIX socket or SSL"
                      properties:
                          accesstype:
                              type: string
                              enum: [FROMENV]
                      required: [accesstype]
                      additionalProperties: false
        required: [mgrtype, accessinfo]
        additionalProperties: false
    # Name of the network the SL containers are attached to.
    network:
        type: string
        description: "Network to attach the SL containers"
    # APLS locator: `host` is mandatory; `port` is optional and may be null.
    apls:
        type: object
        description: "APLS object definition"
        properties:
            locator:
                type: object
                properties:
                    host:
                        type: string
                    port:
                        oneOf:
                            - const: null
                            - type: integer
                              minimum: 0
                required: [host]
                additionalProperties: false
        required: [locator]
        additionalProperties: false
    # API server locator. The `host` and `service` keys must be present, but
    # each may be null; `port` is optional.
    # NOTE(review): unlike `apls`, this object declares neither `required` nor
    # `additionalProperties` at its top level - confirm that leaving `locator`
    # optional (and extra keys allowed) is intended.
    apisrv:
        type: object
        description: "API SERVER object definition"
        properties:
            locator:
                type: object
                properties:
                    host:
                        oneOf:
                            - const: null
                            - type: string
                              minLength: 1
                              maxLength: 253
                    port:
                        oneOf:
                            - const: null
                            - type: integer
                              minimum: 0
                    service:
                        oneOf:
                            - const: null
                            - type: string
                              minLength: 1
                              maxLength: 253
                required: [host, service]
                additionalProperties: false
    # Global environment variables shared by all SL nodes: null, or a list of
    # key-value mappings.
    envvars:
        oneOf:
          - const: null
          - type: array
            description: "List of global environmental variables as K-V pair common to all SL nodes"
            # uniqueItems is an array-level keyword, so it sits next to `items`;
            # inside the per-entry schema it would have no effect.
            uniqueItems: true
            items:
                type: object

    # Allow service endpoints to be specified as domain names, rather than IPs.
    # Traffic can be routed through a reverse proxy or a web gateway by making
    # these name servers resolve the domain names into the IP of the reverse
    # proxy and then configuring the reverse proxy to forward to the appropriate
    # service. This is at a global level for now - primarily because we cannot
    # think of a use case for two containers on a production box wanting to use
    # different name servers. If and when someone demonstrates the scenario, we
    # can move this field down to the nodes section. To limit the current scope
    # of work, we do not support any of the other related parameters, such as
    # dns-opt, dns-search and domain-name. Further, we feel we should explore
    # ways to create an open specification that can accept all valid parameters
    # and not just a chosen handful.
    dns:
        oneOf:
          - const: null
          - type: array
            description: "List of custom DNS servers"
            # uniqueItems is an array-level keyword, so it sits next to `items`;
            # inside the per-entry string schema it would have no effect.
            uniqueItems: true
            items:
                type: string
                minLength: 1
                maxLength: 253

    # Per-node specifications; each list entry wraps its settings in `slnodedef`.
    # NOTE(review): `items` is written as a list here, whereas the declared
    # 2020-12 draft expects a single schema for `items` - confirm how the
    # consumer interprets this wrapper before relying on it for validation.
    nodes:
        type: array
        description: "List of Node specifications managed by this SWOP entity"
        items:
            - slnodedef:
                  type: object
                  properties:
                      # The index is supplied by the user; numbering starts at 0 and
                      # negative values are not allowed.
                      idx:
                          description: "Identifier for the SL node in this SWOP profile"
                          type: integer
                          minimum: 0
                      # Identity material for the node. Each entry is an `attribute` object
                      # stating what is supplied (aType), how it is mounted (mType), and the
                      # source/target locations.
                      identity:
                        type: array
                        description: "Information about identity of this node"
                        items:
                            - attribute:
                                type: object
                                description: "Specification of the certificates or spiffe endpoint"
                                properties:
                                    aType:
                                        type: string
                                        description: "Identity specification attribute type"
                                        enum:
                                            - KEY
                                            - CERT
                                            - CACERT
                                            - CAPATH
                                            - SPIFFE_SOCKETPATH
                                    mType:
                                        type: string
                                        description: "Type of mount to be used to mount the user data"
                                        enum:
                                            - BIND
                                            - VOLUME
                                    src:
                                        type: string
                                        description: "Host directory or fullpath or Volume name to be used for Mount"
                                        maxLength: 240
                                        minLength: 2
                                    tgt:
                                        type: string
                                        description: "Target directory or fullpath inside container"
                                        maxLength: 240
                                        minLength: 2
                                    # The key must be present, but its value may be null.
                                    subPath:
                                        oneOf:
                                            - const: null
                                            - type: string
                                              description: "Optional sub-path to the resource relative to tgt"
                                              maxLength: 240
                                              minLength: 2
                                required:
                                    - aType
                                    - mType
                                    - src
                                    - tgt
                                    # Property names are case-sensitive: this entry has to match
                                    # the `subPath` property exactly, otherwise the requirement
                                    # can never be met while additionalProperties is false.
                                    - subPath
                                additionalProperties: false
                      # Address under which the SL container is reachable from outside; may be null.
                      slhostip:
                          oneOf:
                              - const: null
                              - type: string
                                description: "Externally visible IPv4 Address or FQDN of the SL Container"
                                minLength: 2
                                maxLength: 240
                      # FS server port of the node; null or a non-negative integer.
                      slport:
                          oneOf:
                              - const: null
                              - type: integer
                                description: "FS Server port exposed by this SL NODE (default 30305)"
                                minimum: 0
                      # FS service endpoint given as a name (optionally with a port); may be null.
                      slfsservice:
                          oneOf:
                              - const: null
                              - type: string
                                description: "FQDN and optional port of the FS service for the SL Container"
                                minLength: 2
                                # Upper bound follows https://stackoverflow.com/a/32294443.
                                maxLength: 253
                      # Docker host name assigned to the SL container; the key is required
                      # for every node but its value may be null.
                      slhostname:
                          oneOf:
                              - const: null
                              - type: string
                                description: "Docker host name for the SL container"
                                minLength: 2
                                maxLength: 240
                      # Private data store of the node. May be null; otherwise both the
                      # source and the mount type must be given.
                      privatedata:
                          oneOf:
                              - const: null
                              - type: object
                                description: "Private data store this node exposes as bind or volume mount"
                                properties:
                                    src:
                                        type: string
                                        description: "Source Volume or directory"
                                        minLength: 2
                                        maxLength: 240
                                    # The mount destination is not part of the profile: it comes
                                    # from the RUN-SWARM task definition, and the program decides
                                    # where in the filesystem this directory is mounted.
                                    mType:
                                        type: string
                                        description: "Type of mount to be used to mount the user data"
                                        enum: [BIND, VOLUME]
                                required: [src, mType]
                                additionalProperties: false
                      # Node-local environment variables for the SL container: null, or a
                      # list of key-value mappings.
                      slenvvars:
                          oneOf:
                              - const: null
                              - type: array
                                description: "List of local environmental variables as K-V pair specific to SL node.
                                               This will take precedence over global environmental variables."
                                # uniqueItems is an array-level keyword, so it sits next to
                                # `items`; inside the per-entry schema it would have no effect.
                                uniqueItems: true
                                items:
                                    type: object

                      # In our practice, we are using labels only for the SPIFFE selector, not envvars.
                      # Accepted forms: null, a dictionary of labels, or a list of label names.
                      sllabels:
                          oneOf:
                            - const: null
                            - type: object
                              description: "Dictionary of labels for the Swarm Learning container"
                              # NOTE(review): `items`/`uniqueItems` are array keywords and have no
                              # effect on an object; kept as-is pending confirmation of intent.
                              items:
                                  uniqueItems: true
                            - type: array
                              description: "List of labels with empty values for the Swarm Learning container"
                              # uniqueItems is an array-level keyword, so it sits next to
                              # `items`; inside the per-entry schema it would have no effect.
                              uniqueItems: true
                              items:
                                  type: string
                      # This attribute helps in starting the SL container with the pre-defined python docker
                      # network SDK option called "ipv4_address". It was added as part of the reverse proxy
                      # multi-host scenario where, instead of using the host IP within the nginx configuration,
                      # we can now use container IPs for easier and simpler IP and port mapping.
                      slnetworkopts:
                          oneOf:
                              - const: null
                              - type: array
                                description: "List of K-V pairs passed directly to connect SL docker container to specific network.
                                              This option is needed to start the SL container with a static IP. The IP address is
                                              passed using 'ip' key inside slnetworkopts. This will get converted to ipv4_address
                                              argument of the python docker sdk network connect method. This is same as passing '--ip'
                                              argument to the docker cli run command.
                                              "
                                # uniqueItems is an array-level keyword, so it sits next to
                                # `items`; inside the per-entry schema it would have no effect.
                                uniqueItems: true
                                items:
                                    type: object


                      # Host name setting for the USR container; the key is required for
                      # every node but its value may be null.
                      usrhostname:
                          oneOf:
                              - const: null
                              - type: string
                                description: "Externally visible IPv4 Address or FQDN of the USR Container"
                                minLength: 2
                                maxLength: 240
                      # Environment variables for the USR container: null, or a list of
                      # key-value mappings.
                      usrenvvars:
                          oneOf:
                              - const: null
                              - type: array
                                description: "List of local environmental variables as K-V pair specific to USR Container.
                                               This will take precedence over global environmental variables specified in
                                               run task definition."
                                # uniqueItems is an array-level keyword, so it sits next to
                                # `items`; inside the per-entry schema it would have no effect.
                                uniqueItems: true
                                items:
                                    type: object


                      # Labels for the user ML container: null, a dictionary of labels, or a
                      # list of label names.
                      usrlabels:
                          oneOf:
                            - const: null
                            - type: object
                              description: "Dictionary of labels for the user ML container"
                              # NOTE(review): `items`/`uniqueItems` are array keywords and have no
                              # effect on an object; kept as-is pending confirmation of intent.
                              items:
                                  uniqueItems: true
                            - type: array
                              description: "List of labels with empty values for the user ML container"
                              # uniqueItems is an array-level keyword, so it sits next to
                              # `items`; inside the per-entry schema it would have no effect.
                              uniqueItems: true
                              items:
                                  type: string
                      # This attribute helps in starting the ML container with the pre-defined python docker
                      # network SDK option called "ipv4_address". It was added as part of the reverse proxy
                      # multi-host scenario where, instead of using the host IP within the nginx configuration,
                      # we can now use container IPs for easier and simpler IP and port mapping.
                      usrnetworkopts:
                          oneOf:
                              - const: null
                              - type: array
                                description: "List of K-V pairs passed directly to connect user docker container to specific network.
                                              This option is needed to start the user container with a static IP. The IP address is
                                              passed using 'ip' key inside usrnetworkopts. This will get converted to ipv4_address
                                              argument of the python docker sdk network connect method. This is same as passing '--ip'
                                              argument to the docker cli run command.
                                              "
                                # uniqueItems is an array-level keyword, so it sits next to
                                # `items`; inside the per-entry schema it would have no effect.
                                uniqueItems: true
                                items:
                                    type: object
                      # Extra options handed to docker when the USR container is started
                      # (see the description for the GPU-related keys): null, or a list of
                      # key-value mappings. The description is a literal block, so its
                      # whitespace and embedded quotes are part of the string.
                      usrcontaineropts:
                          oneOf:
                              - const: null
                              - type: array
                                description: |
                                               "List of K-V pairs passed directly to docker while starting USR container.
                                               These options are needed to use GPUs for user ML local training.
                                               These options are specific to GPU vendors. Refer options as applicable to your GPU vendor."
                                               1. Nvidia GPUs -   
                                               "Only 'gpus' key is supported. Refer below link to know how to specify the values. 
                                               https://docs.docker.com/config/containers/resource_constraints/#gpu "
                                               
                                               2. AMD GPUs - 
                                               "Keys supported are 'device', 'ipc', 'shm-size', 'group-add',
                                               'cap-add', 'security-opt', 'privileged'.
                                               Refer below link to know the options for AMD GPU access.
                                               https://developer.amd.com/resources/rocm-learning-center/deep-learning/
                                               
                                               Required options varies for tensorflow and pytorch.
                                               Tensorflow ====>> sudo docker run -it --network=host --device=/dev/kfd --device=/dev/dri 
                                               --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE 
                                               --security-opt seccomp=unconfined -v $HOME/dockerx:/dockerx rocm/tensorflow:latest
                                               PyTorch ====>> sudo docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd 
                                               --device=/dev/dri --group-add video rocm/pytorch:rocm3.5_ubuntu16.04_py3.6_pytorch"
                                               
                                               Example for specifying values - 
                                               - device : ["/dev/kfd", "/dev/dri"] # (list of str) -> Expose host devices to the container, 
                                                                                     as a list of strings
                                               - ipc : "host"                      # (str) -> Set the IPC mode for the container
                                               - shm-size : "16G"                  # (str) -> Size of /dev/shm (e.g. 1G)
                                               - group-add : ["video"]             # (list of str) -> List of additional group names 
                                                                                     and/or IDs that the container process will run as.
                                               - cap-add : ["SYS_PTRACE"]          # (list of str) -> Add kernel capabilities. 
                                               - security-opt : ["seccomp=unconfined"] # (list of str) -> A list of string values to customize labels 
                                                                                         for MLS systems, such as SELinux.
                                               - privileged : True                 # (bool) -> Give extended privileges to this container.
                                               
                                # uniqueItems is an array-level keyword, so it sits next to
                                # `items`; inside the per-entry schema it would have no effect.
                                uniqueItems: true
                                items:
                                    type: object
                  # Keys every node definition must carry (several of them accept null).
                  required:
                      - idx
                      - identity
                      - slhostname
                      - privatedata
                      - slenvvars
                      - usrhostname
                      - usrenvvars
                  # Keys other than those declared under `properties` are rejected.
                  additionalProperties: false
        # Applies to the `nodes` list itself: entries must be pairwise distinct.
        uniqueItems: true

# Top-level keys that every profile must define; `dns` is the only declared
# property that may be omitted.
required:
    - groupname
    - taskrunner
    - policyuri
    - resourcemgr
    - network
    - apls
    - apisrv
    - envvars
    - nodes
# Top-level keys not declared under `properties` are rejected.
additionalProperties: false