forked from kata-containers/kata-containers
-
Notifications
You must be signed in to change notification settings - Fork 3
/
container.go
1545 lines (1300 loc) · 47.7 KB
/
container.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2014,2015,2016,2017 Docker, Inc.
// SPDX-License-Identifier: Apache-2.0
//
package virtcontainers
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"syscall"
"time"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// containerTracingTags defines the tags passed to katatrace.Trace for the
// spans created by the container code in this file.
var containerTracingTags = map[string]string{
"source": "runtime",
"package": "virtcontainers",
"subsystem": "container",
}
// cdromMajors maps the major device numbers of CD-ROM drivers to the name of
// the matching definition in the Linux header below. filterDevices uses it as
// a lookup set to leave CD-ROM devices out of a container's device list.
//
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
// This file has definitions for major device numbers.
var cdromMajors = map[int64]string{
11: "SCSI_CDROM_MAJOR",
15: "CDU31A_CDROM_MAJOR",
16: "GOLDSTAR_CDROM_MAJOR",
17: "OPTICS_CDROM_MAJOR",
18: "SANYO_CDROM_MAJOR",
20: "MITSUMI_X_CDROM_MAJOR",
23: "MITSUMI_CDROM_MAJOR",
24: "CDU535_CDROM_MAJOR",
25: "MATSUSHITA_CDROM_MAJOR",
26: "MATSUSHITA_CDROM2_MAJOR",
27: "MATSUSHITA_CDROM3_MAJOR",
28: "MATSUSHITA_CDROM4_MAJOR",
29: "AZTECH_CDROM_MAJOR",
32: "CM206_CDROM_MAJOR",
}
// floppyMajor is the major device number of floppy drives. filterDevices
// compares device majors against it to skip floppy devices.
//
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
// #define FLOPPY_MAJOR 2
const floppyMajor = int64(2)
// Process gathers data related to a container process.
type Process struct {
// StartTime is presumably the time the process was started; nothing in
// this part of the file assigns it — TODO confirm against the setter.
StartTime time.Time
// Token is the process execution context ID. It must be
// unique per sandbox.
// Token is used to manipulate processes for containers
// that have not started yet, and later identify them
// uniquely within a sandbox.
Token string
// Pid is the process ID as seen by the host software
// stack, e.g. CRI-O, containerd. This is typically the
// shim PID.
Pid int
}
// ContainerStatus describes a container status.
type ContainerStatus struct {
// Spec points to an OCI runtime specification (specs.Spec).
Spec *specs.Spec
// Annotations allow clients to store arbitrary values,
// for example to add additional status values required
// to support particular specifications.
Annotations map[string]string
// ID is presumably the container identifier — TODO confirm against the
// code that fills this structure (not visible in this part of the file).
ID string
// RootFs is a string describing the container rootfs; presumably a
// path — TODO confirm.
RootFs string
// StartTime is presumably the container start time — TODO confirm.
StartTime time.Time
// State is the container state, using the same type as Container.state.
State types.ContainerState
// PID is presumably the container process ID — TODO confirm.
PID int
}
// ThrottlingData gathers the data related to container CPU throttling.
// Every field is dropped from the JSON output when it is zero (omitempty).
type ThrottlingData struct {
// Number of periods with throttling active
Periods uint64 `json:"periods,omitempty"`
// Number of periods when the container hit its throttling limit.
ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
// Aggregate time the container was throttled for in nanoseconds.
ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
// CPUUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
//
// PercpuUsage and TotalUsage are omitted from JSON when empty/zero, while the
// kernel-mode and user-mode counters carry no omitempty and are always emitted.
type CPUUsage struct {
// Total CPU time consumed per core.
// Units: nanoseconds.
PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
// Total CPU time consumed.
// Units: nanoseconds.
TotalUsage uint64 `json:"total_usage,omitempty"`
// Time spent by tasks of the cgroup in kernel mode.
// Units: nanoseconds.
UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
// Time spent by tasks of the cgroup in user mode.
// Units: nanoseconds.
UsageInUsermode uint64 `json:"usage_in_usermode"`
}
// CPUStats describes the CPU stats: usage counters plus throttling data.
//
// NOTE(review): both fields are struct values, and encoding/json's omitempty
// never treats a struct as empty, so both keys are always emitted.
type CPUStats struct {
CPUUsage CPUUsage `json:"cpu_usage,omitempty"`
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}
// MemoryData gathers the data related to one class of memory accounting.
// MemoryStats uses it for plain, swap, kernel and kernel TCP memory.
type MemoryData struct {
// Usage and MaxUsage are omitted from JSON when zero.
Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"`
// Failcnt and Limit carry no omitempty and are always emitted.
Failcnt uint64 `json:"failcnt"`
Limit uint64 `json:"limit"`
}
// MemoryStats describes the memory stats.
type MemoryStats struct {
// Stats holds named counters; the set of keys is not defined in this
// file — presumably raw cgroup memory.stat entries, TODO confirm.
Stats map[string]uint64 `json:"stats,omitempty"`
// usage of memory
Usage MemoryData `json:"usage,omitempty"`
// usage of memory swap
SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
// memory used for cache
Cache uint64 `json:"cache,omitempty"`
// if true, memory usage is accounted for throughout a hierarchy of cgroups.
UseHierarchy bool `json:"use_hierarchy"`
}
// PidsStats describes the pids stats. Both fields are omitted from JSON when
// zero.
type PidsStats struct {
// number of pids in the cgroup
Current uint64 `json:"current,omitempty"`
// active pids hard limit
Limit uint64 `json:"limit,omitempty"`
}
// BlkioStatEntry gathers the data of one block I/O statistic: a value for a
// given operation on the block device identified by Major and Minor.
type BlkioStatEntry struct {
// Op names the operation the value applies to; the accepted strings are
// not defined in this file.
Op string `json:"op,omitempty"`
// Major and Minor are the device numbers of the block device.
Major uint64 `json:"major,omitempty"`
Minor uint64 `json:"minor,omitempty"`
// Value is the statistic itself; its unit depends on the list that holds
// the entry (see BlkioStats).
Value uint64 `json:"value,omitempty"`
}
// BlkioStats describes block I/O stats. Each field is a list of per-device,
// per-operation entries and is omitted from JSON when empty.
type BlkioStats struct {
// number of bytes transferred to and from the block device
IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
// NOTE(review): the JSON key is "io_queue_recursive" (not "io_queued_...");
// it is part of the serialized format, so it is documented rather than changed.
IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
}
// HugetlbStats describes hugetlb (huge page) memory stats for one huge page
// size; CgroupStats keys these by page size.
type HugetlbStats struct {
// current res_counter usage for hugetlb
Usage uint64 `json:"usage,omitempty"`
// maximum usage ever recorded.
MaxUsage uint64 `json:"max_usage,omitempty"`
// number of times a hugetlb usage allocation failed. Always emitted in
// JSON (no omitempty).
Failcnt uint64 `json:"failcnt"`
}
// CgroupStats describes all cgroup subsystem stats.
//
// NOTE(review): apart from HugetlbStats (a map), the fields are struct values;
// encoding/json's omitempty never treats a struct as empty, so those keys are
// always emitted.
type CgroupStats struct {
// the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
CPUStats CPUStats `json:"cpu_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
}
// NetworkStats describes the stats of one network interface. The Rx* fields
// are receive-side counters and the Tx* fields transmit-side counters; every
// field is omitted from JSON when empty/zero.
type NetworkStats struct {
// Name is the name of the network interface.
Name string `json:"name,omitempty"`
RxBytes uint64 `json:"rx_bytes,omitempty"`
RxPackets uint64 `json:"rx_packets,omitempty"`
RxErrors uint64 `json:"rx_errors,omitempty"`
RxDropped uint64 `json:"rx_dropped,omitempty"`
TxBytes uint64 `json:"tx_bytes,omitempty"`
TxPackets uint64 `json:"tx_packets,omitempty"`
TxErrors uint64 `json:"tx_errors,omitempty"`
TxDropped uint64 `json:"tx_dropped,omitempty"`
}
// ContainerStats describes a container's stats.
type ContainerStats struct {
// CgroupStats points to the cgroup subsystem stats.
CgroupStats *CgroupStats
// NetworkStats holds one entry per network interface (each entry carries
// the interface name).
NetworkStats []*NetworkStats
}
// ContainerResources describes container resources.
type ContainerResources struct {
// VCPUs are the number of vCPUs that are being used by the container
VCPUs uint32
// MemByte is the memory that is being used by the container; the field
// name suggests the unit is bytes — TODO confirm.
MemByte int64
}
// ContainerConfig describes one container runtime configuration.
type ContainerConfig struct {
// Device configuration for devices that must be available within the container.
DeviceInfos []config.DeviceInfo
// Mounts is the list of container mounts; newContainer copies it into
// Container.mounts.
Mounts []Mount
// Raw OCI specification, it won't be saved to disk (excluded from JSON
// by the "-" tag).
CustomSpec *specs.Spec `json:"-"`
// Annotations allow clients to store arbitrary values,
// for example to add additional status values required
// to support particular specifications.
Annotations map[string]string
// ID is the container identifier; valid() rejects an empty ID.
ID string
// Resources holds the container resources.
Resources specs.LinuxResources
// Cmd specifies the command to run on a container
Cmd types.Cmd
// RootFs is the container workload image on the host.
RootFs RootFs
// ReadonlyRootfs indicates if the rootfs should be mounted readonly
ReadonlyRootfs bool
}
// valid reports whether the container configuration is usable: the receiver
// must be non-nil and carry a non-empty ID.
func (c *ContainerConfig) valid() bool {
	return c != nil && c.ID != ""
}
// SystemMountsInfo describes additional information for system mounts that the agent
// needs to handle
type SystemMountsInfo struct {
// Indicates if /dev has been passed as a bind mount for the host /dev
BindMountDev bool
// Size of /dev/shm assigned on the host. The unit is not stated in this
// file — presumably bytes, TODO confirm.
DevShmSize uint
}
// ContainerDevice describes a device associated with a container.
// createDevices builds these entries from the device configuration and the
// ID returned by the sandbox's device manager.
type ContainerDevice struct {
// ID is device id referencing the device from sandbox's device manager
ID string
// ContainerPath is device path displayed in container
ContainerPath string
// FileMode permission bits for the device.
FileMode os.FileMode
// UID is user ID in the container namespace
UID uint32
// GID is group ID in the container namespace
GID uint32
}
// RootFs describes the container's rootfs.
type RootFs struct {
// Source specifies the BlockDevice path
Source string
// Target specifies where the rootfs is mounted if it has been mounted
Target string
// Type specifies the type of filesystem to mount.
Type string
// Options specifies zero or more fstab style mount options.
// createVirtualVolumeDevices also scans these for entries that start
// with VirtualVolumePrefix.
Options []string
// Mounted specifies whether the rootfs has been mounted or not
Mounted bool
}
// Container holds the runtime view of a single container that belongs to a
// sandbox: its configuration, state, process, mounts and devices.
// A Container can be created, deleted, started, stopped, listed, entered, paused and restored.
type Container struct {
// ctx is copied from the owning sandbox's ctx by newContainer.
ctx context.Context
// config is the configuration the container was created from.
config *ContainerConfig
// sandbox is the sandbox that owns this container.
sandbox *Sandbox
// id and sandboxID are the container and sandbox identifiers.
id string
sandboxID string
// containerPath is the sandbox ID joined with the container ID.
containerPath string
// rootfsSuffix is set to "rootfs" by newContainer.
rootfsSuffix string
// mounts starts as config.Mounts; entries are updated in place as block
// devices are created for them and as mounts are shared.
mounts []Mount
// devices lists the devices kept by filterDevices in createDevices.
devices []ContainerDevice
state types.ContainerState
process Process
rootFs RootFs
systemMountsInfo SystemMountsInfo
}
// ID returns the container identifier string, i.e. the ID taken from the
// container configuration when the Container was built by newContainer.
func (c *Container) ID() string {
return c.id
}
// Logger returns a logrus entry derived from virtLog and tagged with the
// container subsystem, the owning sandbox ID and the container ID.
func (c *Container) Logger() *logrus.Entry {
	fields := logrus.Fields{
		"subsystem": "container",
		"sandbox":   c.sandboxID,
		"container": c.id,
	}

	return virtLog.WithFields(fields)
}
// Sandbox returns the sandbox handler related to this container.
//
// NOTE(review): the *Sandbox field is returned through the VCSandbox
// interface, so when the field is nil the result is a non-nil interface
// holding a nil pointer; comparing the result with nil does not detect it.
func (c *Container) Sandbox() VCSandbox {
return c.sandbox
}
// Process returns a copy of the container process data (Process is returned
// by value).
func (c *Container) Process() Process {
return c.process
}
// GetToken returns the token related to this container's process, i.e. the
// Token field of the container's Process.
func (c *Container) GetToken() string {
return c.process.Token
}
// GetPid returns the pid related to this container's process, i.e. the Pid
// field of the container's Process.
func (c *Container) GetPid() int {
return c.process.Pid
}
// setStateFstype records fstype in the in-memory container state. Unlike
// setContainerState it does not save the sandbox. It always returns nil.
func (c *Container) setStateFstype(fstype string) error {
c.state.Fstype = fstype
return nil
}
// GetAnnotations returns the container's annotations. The map stored in the
// container configuration is returned directly, not a copy.
func (c *Container) GetAnnotations() map[string]string {
return c.config.Annotations
}
// GetPatchedOCISpec returns container's OCI specification, i.e. the
// CustomSpec pointer stored in the container configuration (not a copy).
// This OCI specification was patched when the sandbox was created
// by containerCapabilities(), SetEphemeralStorageType() and others
// in order to support:
// * Capabilities
// * Ephemeral storage
// * k8s empty dir
// If you need the original (vanilla) OCI spec,
// use compatoci.GetContainerSpec() instead.
func (c *Container) GetPatchedOCISpec() *specs.Spec {
return c.config.CustomSpec
}
// setContainerState updates the container state in memory and then saves the
// sandbox so the new state reaches storage.
//
// An empty state is rejected with types.ErrNeedState. Any error returned by
// the sandbox save is passed back unchanged; the in-memory state has already
// been updated at that point.
func (c *Container) setContainerState(state types.StateString) error {
	if state == "" {
		return types.ErrNeedState
	}

	previous := c.state.State
	c.Logger().Debugf("Setting container state from %v to %v", previous, state)

	// In-memory update first, then flush to storage.
	c.state.State = state

	return c.sandbox.Save()
}
// mountSharedDirMounts handles bind-mounts by bindmounting to the host shared
// directory which is mounted through virtiofs/9pfs in the VM.
// It also updates the container mount list with the HostPath info, and store
// container mounts to the storage. This way, we will have the HostPath info
// available when we will need to unmount those mounts.
//
// For every container mount it:
//   - skips system mounts, non-bind mounts, /dev/shm and host device paths;
//   - attaches the block device when the mount carries a BlockDeviceID;
//   - otherwise shares the file through the sandbox's filesystem share and
//     records the result in sharedDirMounts (keyed by destination), or in
//     ignoredMounts (keyed by source) when nothing was shared.
//
// The returned storages describe the "watchable" bind mounts the agent has to
// create. On error, the block devices attached by this call are detached
// again; a failure to detach is logged and does not replace the original
// error. Both maps must be non-nil because they are written to.
func (c *Container) mountSharedDirMounts(ctx context.Context, sharedDirMounts, ignoredMounts map[string]Mount) (storages []*grpc.Storage, err error) {
	var devicesToDetach []string
	defer func() {
		if err == nil {
			return
		}
		// Best-effort rollback: keep going over the remaining devices even if
		// one of them cannot be detached, but do not hide the failure.
		for _, id := range devicesToDetach {
			if detachErr := c.sandbox.devManager.DetachDevice(ctx, id, c.sandbox); detachErr != nil {
				c.Logger().WithError(detachErr).WithField("device-id", id).
					Warn("failed to detach block device while rolling back shared mounts")
			}
		}
	}()

	for idx, m := range c.mounts {
		// Skip mounting certain system paths from the source on the host side
		// into the container as it does not make sense to do so.
		// Example sources could be /sys/fs/cgroup etc.
		if isSystemMount(m.Source) {
			continue
		}

		// Check if mount is a block device file. If it is, the block device will be attached to the host
		// instead of passing this as a shared mount:
		if len(m.BlockDeviceID) > 0 {
			// Attach this block device, all other devices passed in the config have been attached at this point
			if err = c.sandbox.devManager.AttachDevice(ctx, m.BlockDeviceID, c.sandbox); err != nil {
				return storages, err
			}
			devicesToDetach = append(devicesToDetach, m.BlockDeviceID)
			continue
		}

		// For non-block based mounts, we are only interested in bind mounts
		if m.Type != "bind" {
			continue
		}

		// We need to treat /dev/shm as a special case. This is passed as a bind mount in the spec,
		// but it does not make sense to pass this as a 9p mount from the host side.
		// This needs to be handled purely in the guest, by allocating memory for this inside the VM.
		if m.Destination == "/dev/shm" {
			continue
		}

		// Ignore /dev, directories and all other device files. We handle
		// only regular files in /dev. It does not make sense to pass the host
		// device nodes to the guest.
		if isHostDevice(m.Destination) {
			continue
		}

		// Pass the slice element itself (not the loop copy) so that updates
		// made while sharing are kept in c.mounts.
		sharedFile, shareErr := c.sandbox.fsShare.ShareFile(ctx, c, &c.mounts[idx])
		if shareErr != nil {
			return storages, shareErr
		}

		// Expand the list of mounts to ignore.
		if sharedFile == nil {
			ignoredMounts[m.Source] = Mount{Source: m.Source}
			continue
		}

		sharedDirMount := Mount{
			Source:      sharedFile.guestPath,
			Destination: m.Destination,
			Type:        m.Type,
			Options:     m.Options,
			ReadOnly:    m.ReadOnly,
		}

		// virtiofs does not support inotify. To workaround this limitation, we want to special case
		// mounts that are commonly 'watched'. "watchable" mounts include:
		//  - Kubernetes configmap
		//  - Kubernetes secret
		// If we identify one of these, we'll need to carry out polling in the guest in order to present the
		// container with a mount that supports inotify. To do this, we create a Storage object for
		// the "watchable-bind" driver. This will have the agent create a new mount that is watchable,
		// whose effective source is the original mount (the agent will poll the original mount for changes and
		// manually update the path that is mounted into the container).
		// Based on this, let's make sure we update the sharedDirMount structure with the new watchable-mount as
		// the source (this is what is utilized to update the OCI spec).
		caps := c.sandbox.hypervisor.Capabilities(ctx)
		if isWatchableMount(m.Source) && caps.IsFsSharingSupported() {
			// Create path in shared directory for creating watchable mount:
			watchableHostPath := filepath.Join(getMountPath(c.sandboxID), "watchable")
			if mkdirErr := os.MkdirAll(watchableHostPath, DirMode); mkdirErr != nil {
				return storages, fmt.Errorf("unable to create watchable path: %s: %w", watchableHostPath, mkdirErr)
			}

			watchableGuestMount := filepath.Join(kataGuestSharedDir(), "watchable", filepath.Base(sharedFile.guestPath))

			storage := &grpc.Storage{
				Driver:     kataWatchableBindDevType,
				Source:     sharedFile.guestPath,
				Fstype:     "bind",
				MountPoint: watchableGuestMount,
				Options:    m.Options,
			}
			storages = append(storages, storage)

			// Update the sharedDirMount, in order to identify what will
			// change in the OCI spec.
			sharedDirMount.Source = watchableGuestMount
		}

		sharedDirMounts[sharedDirMount.Destination] = sharedDirMount
	}

	return storages, nil
}
// unmountHostMounts unshares every container mount that has a non-empty
// HostPath, in list order, and stops at the first failure.
//
// Each unshare runs inside its own "unmount" trace span; when it fails the
// error is added to the span as a tag, a warning is logged and the error is
// returned to the caller.
func (c *Container) unmountHostMounts(ctx context.Context) error {
	span, ctx := katatrace.Trace(ctx, c.Logger(), "unmountHostMounts", containerTracingTags, map[string]string{"container_id": c.id})
	defer span.End()

	unshare := func(mnt Mount) (err error) {
		unmountSpan, _ := katatrace.Trace(ctx, c.Logger(), "unmount", containerTracingTags, map[string]string{"container_id": c.id, "host-path": mnt.HostPath})
		defer func() {
			if err != nil {
				katatrace.AddTags(unmountSpan, "error", err)
			}
			unmountSpan.End()
		}()

		err = c.sandbox.fsShare.UnshareFile(ctx, c, &mnt)
		if err == nil {
			return nil
		}

		c.Logger().WithFields(logrus.Fields{
			"host-path": mnt.HostPath,
			"error":     err,
		}).Warn("Could not umount")
		return err
	}

	for _, mnt := range c.mounts {
		// Mounts without a host path have nothing to unshare.
		if mnt.HostPath == "" {
			continue
		}
		if err := unshare(mnt); err != nil {
			return err
		}
	}

	return nil
}
// filterDevices returns the entries of devices that should be kept for the
// container, preserving their order. A device is dropped, with an info log,
// when the major number reported by the sandbox device manager belongs to a
// CD-ROM driver (cdromMajors) or is the floppy major (floppyMajor).
// The result is nil when no device is kept.
func filterDevices(c *Container, devices []ContainerDevice) (ret []ContainerDevice) {
	for _, candidate := range devices {
		major, _ := c.sandbox.devManager.GetDeviceByID(candidate.ID).GetMajorMinor()
		_, isCDROM := cdromMajors[major]

		switch {
		case isCDROM:
			c.Logger().WithFields(logrus.Fields{
				"device": candidate.ContainerPath,
			}).Info("Not attach device because it is a CDROM")
		case major == floppyMajor:
			c.Logger().WithFields(logrus.Fields{
				"device": candidate.ContainerPath,
			}).Info("Not attaching device because it is a floppy drive")
		default:
			ret = append(ret, candidate)
		}
	}

	return ret
}
// createBlockDevices registers a device with the sandbox device manager for
// every container mount whose source turns out to be block based, and stores
// the resulting device ID in the mount's BlockDeviceID.
//
// Only bind mounts and mounts flagged with the IsFileBlockDevice option are
// considered. When the mount source carries direct-assigned volume info, the
// mount's source, type, options, read-only flag and FSGroup settings are
// replaced with the values from that info first.
//
// Failures on an individual mount are logged (or silently skipped when the
// device info cannot be built) and never abort the loop; the function always
// returns nil.
func (c *Container) createBlockDevices(ctx context.Context) error {
	if !c.checkBlockDeviceSupport(ctx) {
		c.Logger().Warn("Block device not supported")
		return nil
	}

	for idx := range c.mounts {
		mnt := &c.mounts[idx]

		// A device is already associated with this mount: nothing to create.
		if len(mnt.BlockDeviceID) > 0 {
			continue
		}

		// We only handle bind and block device mounts.
		isBlockFile := HasOption(mnt.Options, vcAnnotations.IsFileBlockDevice)
		if !isBlockFile && mnt.Type != "bind" {
			continue
		}

		// Handle directly assigned volume. Update the mount info based on the mount info json.
		mntInfo, infoErr := volume.VolumeMountInfo(mnt.Source)
		if infoErr != nil && !os.IsNotExist(infoErr) {
			c.Logger().WithError(infoErr).WithField("mount-source", mnt.Source).
				Error("failed to parse the mount info file for a direct assigned volume")
			continue
		}

		if mntInfo != nil {
			// Write out sandbox info file on the mount source to allow CSI to communicate with the runtime
			if err := volume.RecordSandboxId(c.sandboxID, mnt.Source); err != nil {
				c.Logger().WithError(err).Error("error writing sandbox info")
			}

			isReadOnly := false
			for _, opt := range mntInfo.Options {
				if opt == "ro" {
					isReadOnly = true
					break
				}
			}

			mnt.Source = mntInfo.Device
			mnt.Type = mntInfo.FsType
			mnt.Options = mntInfo.Options
			mnt.ReadOnly = isReadOnly

			for key, value := range mntInfo.Metadata {
				switch key {
				case volume.FSGroupMetadataKey:
					gid, err := strconv.Atoi(value)
					if err != nil {
						c.Logger().WithError(err).Errorf("invalid group id value %s provided for key %s", value, volume.FSGroupMetadataKey)
						continue
					}
					mnt.FSGroup = &gid
				case volume.FSGroupChangePolicyMetadataKey:
					// The change policy is meaningless without a group id.
					if _, exists := mntInfo.Metadata[volume.FSGroupMetadataKey]; !exists {
						c.Logger().Errorf("%s specified without provding the group id with key %s", volume.FSGroupChangePolicyMetadataKey, volume.FSGroupMetadataKey)
						continue
					}
					mnt.FSGroupChangePolicy = volume.FSGroupChangePolicy(value)
				default:
					c.Logger().Warnf("Ignoring unsupported direct-assignd volume metadata key: %s, value: %s", key, value)
				}
			}
		}

		// Check if mount is a block device file. If it is, the block device will be attached to the host
		// instead of passing this as a shared mount.
		di, err := c.createDeviceInfo(mnt.Source, mnt.Destination, mnt.ReadOnly, isBlockFile)
		if err != nil || di == nil {
			continue
		}

		dev, err := c.sandbox.devManager.NewDevice(*di)
		if err != nil {
			// Do not return an error, try to create
			// devices for other mounts
			c.Logger().WithError(err).WithField("mount-source", mnt.Source).
				Error("device manager failed to create new device")
			continue
		}
		mnt.BlockDeviceID = dev.DeviceID()
	}

	return nil
}
// initConfigResourcesMemory installs an empty LinuxMemory in the container
// configuration and points the patched OCI spec at the very same object, so
// that later writes through c.config.Resources.Memory are visible in the spec.
//
// The spec is only updated when it exists and has a Linux section; a missing
// Linux.Resources is allocated on the fly. When there is no spec or no Linux
// section, only the container configuration is initialised instead of
// dereferencing a nil pointer.
func (c *Container) initConfigResourcesMemory() {
	c.config.Resources.Memory = &specs.LinuxMemory{}

	ociSpec := c.GetPatchedOCISpec()
	if ociSpec == nil || ociSpec.Linux == nil {
		// No Linux section to keep in sync with the configuration.
		return
	}

	if ociSpec.Linux.Resources == nil {
		ociSpec.Linux.Resources = &specs.LinuxResources{}
	}
	ociSpec.Linux.Resources.Memory = c.config.Resources.Memory
}
// newContainer creates a Container structure from a sandbox and a container configuration.
//
// The configuration is validated before anything else reads it, so a nil or
// ID-less contConfig yields an error instead of a nil dereference. The swap
// annotations (swappiness and swap-in-bytes), when present, are parsed and
// copied into the container resources; an unparsable or out-of-range value is
// reported as an error that wraps the parse error.
//
// The container is then restored from persistent storage. If no persisted
// state exists, block devices are created for its mounts and its configured
// devices are registered with the sandbox's device manager.
//
// On a configuration error the returned container is an empty, non-nil
// Container (kept for backward compatibility); on any other error it is nil.
func newContainer(ctx context.Context, sandbox *Sandbox, contConfig *ContainerConfig) (*Container, error) {
	// Validate first: valid() tolerates a nil contConfig, whereas the trace
	// tags below read contConfig.ID.
	if !contConfig.valid() {
		return &Container{}, fmt.Errorf("Invalid container configuration")
	}

	span, ctx := katatrace.Trace(ctx, nil, "newContainer", containerTracingTags, map[string]string{"container_id": contConfig.ID, "sandbox_id": sandbox.id})
	defer span.End()

	c := &Container{
		id:            contConfig.ID,
		sandboxID:     sandbox.id,
		rootFs:        contConfig.RootFs,
		config:        contConfig,
		sandbox:       sandbox,
		containerPath: filepath.Join(sandbox.id, contConfig.ID),
		rootfsSuffix:  "rootfs",
		state:         types.ContainerState{},
		process:       Process{},
		mounts:        contConfig.Mounts,
		ctx:           sandbox.ctx,
	}

	// Set the Annotations of SWAP to Resources
	if resourceSwappinessStr, ok := c.config.Annotations[vcAnnotations.ContainerResourcesSwappiness]; ok {
		resourceSwappiness, err := strconv.ParseUint(resourceSwappinessStr, 0, 64)
		if err == nil && resourceSwappiness > 200 {
			err = fmt.Errorf("swapiness should not bigger than 200")
		}
		if err != nil {
			return &Container{}, fmt.Errorf("Invalid container configuration Annotations %s %w", vcAnnotations.ContainerResourcesSwappiness, err)
		}
		if c.config.Resources.Memory == nil {
			c.initConfigResourcesMemory()
		}
		c.config.Resources.Memory.Swappiness = &resourceSwappiness
	}

	if resourceSwapInBytesStr, ok := c.config.Annotations[vcAnnotations.ContainerResourcesSwapInBytes]; ok {
		// Swap is stored as an int64: parsing with a 63-bit limit makes
		// ParseUint reject values above math.MaxInt64 with a range error
		// instead of letting the conversion below wrap to a negative size.
		resourceSwapInBytesInUint, err := strconv.ParseUint(resourceSwapInBytesStr, 0, 63)
		if err != nil {
			return &Container{}, fmt.Errorf("Invalid container configuration Annotations %s %w", vcAnnotations.ContainerResourcesSwapInBytes, err)
		}
		if c.config.Resources.Memory == nil {
			c.initConfigResourcesMemory()
		}
		resourceSwapInBytes := int64(resourceSwapInBytesInUint)
		c.config.Resources.Memory.Swap = &resourceSwapInBytes
	}

	// experimental runtime use "persist.json" instead of legacy "state.json" as storage
	err := c.Restore()
	if err == nil {
		//container restored
		return c, nil
	}

	// Unexpected error: anything other than "no persisted state yet".
	if !os.IsNotExist(err) && err != errContainerPersistNotExist {
		return nil, err
	}

	// If mounts are block devices, add to devmanager
	if err := c.createMounts(ctx); err != nil {
		return nil, err
	}

	// Add container's devices to sandbox's device-manager
	if err := c.createDevices(contConfig); err != nil {
		return nil, err
	}

	return c, nil
}
// createDeviceInfo builds the device information for a mount source.
//
// The source is stat'ed first; a stat failure is returned as an error. Then:
//   - a block special file yields a "b" device carrying the file's
//     major/minor numbers;
//   - a regular file, when isBlockFile is set, yields a "b" device with
//     major -1 and minor 0;
//   - anything else is handed to config.PmemDeviceInfo, whose result and
//     error are returned as-is (the error is also logged at debug level).
func (c *Container) createDeviceInfo(source, destination string, readonly, isBlockFile bool) (*config.DeviceInfo, error) {
	var st unix.Stat_t
	if statErr := unix.Stat(source, &st); statErr != nil {
		return nil, fmt.Errorf("stat %q failed: %v", source, statErr)
	}

	fileType := st.Mode & unix.S_IFMT

	switch {
	case fileType == unix.S_IFBLK:
		return &config.DeviceInfo{
			HostPath:      source,
			ContainerPath: destination,
			DevType:       "b",
			Major:         int64(unix.Major(uint64(st.Rdev))),
			Minor:         int64(unix.Minor(uint64(st.Rdev))),
			ReadOnly:      readonly,
		}, nil
	case isBlockFile && fileType == unix.S_IFREG:
		return &config.DeviceInfo{
			HostPath:      source,
			ContainerPath: destination,
			DevType:       "b",
			Major:         -1,
			Minor:         0,
			ReadOnly:      readonly,
		}, nil
	}

	// Check whether source can be used as a pmem device
	pmemInfo, err := config.PmemDeviceInfo(source, destination)
	if err != nil {
		c.Logger().WithError(err).
			WithField("mount-source", source).
			Debug("no loop device")
	}

	return pmemInfo, err
}
// createVirtualVolumeDevices scans the rootfs options for entries that start
// with VirtualVolumePrefix, parses each one as a KataVirtualVolume and logs
// its volume type. A parse failure aborts the scan and is returned.
//
// As written, no device information is ever added to the result, so the
// returned slice is always nil on success.
func (c *Container) createVirtualVolumeDevices() ([]config.DeviceInfo, error) {
	var deviceInfos []config.DeviceInfo

	for _, opt := range c.rootFs.Options {
		if !strings.HasPrefix(opt, VirtualVolumePrefix) {
			continue
		}

		encoded := strings.TrimPrefix(opt, VirtualVolumePrefix)
		virtVolume, err := types.ParseKataVirtualVolume(encoded)
		if err != nil {
			return nil, err
		}
		c.Logger().Infof("KataVirtualVolume volumetype = %s", virtVolume.VolumeType)
	}

	return deviceInfos, nil
}
// createMounts prepares the mounts of a newly created container. It only
// delegates to createBlockDevices and returns its result.
func (c *Container) createMounts(ctx context.Context) error {
// Create block devices for newly created container
return c.createBlockDevices(ctx)
}
// createDevices registers the container's configured devices with the
// sandbox's device manager and stores the resulting list in c.devices.
//
// The device list is the virtual volume devices followed by
// contConfig.DeviceInfos. VFIO devices are handled according to the hypervisor
// configuration: with VFIO hot-plug enabled they are marked as not cold-plugged
// and given the hot-plug port; otherwise, with VFIO cold-plug enabled, they are
// left out of device creation and only passed on for VFIO metadata annotation.
// Every other device is created as-is.
//
// The first error from the virtual volume scan or from the device manager is
// returned; in that case c.devices is left untouched.
func (c *Container) createDevices(contConfig *ContainerConfig) error {
	// If devices were not found in storage, create Device implementations
	// from the configuration. This should happen at create.
	virtualVolumeInfos, err := c.createVirtualVolumeDevices()
	if err != nil {
		return err
	}
	allInfos := append(virtualVolumeInfos, contConfig.DeviceInfos...)

	// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
	// until we have TDISP/IDE PCIe support.
	coldPlugVFIO := c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort
	// Aggregate all the container devices for hot-plug and use them to deduce
	// the correct amount of ports to reserve for the hypervisor.
	hotPlugPort := c.sandbox.config.HypervisorConfig.HotPlugVFIO
	hotPlugVFIO := hotPlugPort != config.NoPort

	hotPlugDevices := []config.DeviceInfo{}
	coldPlugDevices := []config.DeviceInfo{}

	for i := range allInfos {
		// Only VFIO devices get their Port and ColdPlug settings updated.
		isVFIO := deviceManager.IsVFIODevice(allInfos[i].ContainerPath)

		switch {
		case isVFIO && hotPlugVFIO:
			allInfos[i].ColdPlug = false
			allInfos[i].Port = hotPlugPort
			hotPlugDevices = append(hotPlugDevices, allInfos[i])
		case isVFIO && coldPlugVFIO:
			// Device is already cold-plugged at sandbox creation time
			// ignore it for the container creation
			coldPlugDevices = append(coldPlugDevices, allInfos[i])
		default:
			hotPlugDevices = append(hotPlugDevices, allInfos[i])
		}
	}

	// If modeVFIO is enabled we need first to attach the VFIO control group
	// device /dev/vfio/vfio and second the actual device(s) afterwards.
	// Sort the devices starting with device #1 being the VFIO control group
	// device and the next the actual device(s) /dev/vfio/<group>
	var storedDevices []ContainerDevice
	for _, info := range sortContainerVFIODevices(hotPlugDevices) {
		dev, err := c.sandbox.devManager.NewDevice(info)
		if err != nil {
			return err
		}

		storedDevices = append(storedDevices, ContainerDevice{
			ID:            dev.DeviceID(),
			ContainerPath: info.ContainerPath,
			FileMode:      info.FileMode,
			UID:           info.UID,
			GID:           info.GID,
		})
	}
	c.devices = filterDevices(c, storedDevices)

	// If we're hot-plugging this will be a no-op because at this stage
	// no devices are attached to the root-port or switch-port
	c.annotateContainerWithVFIOMetadata(coldPlugDevices)

	return nil
}
// rollbackFailingContainerCreation undoes the steps that may have been
// performed before a container creation failed. In order, it:
//   - detaches the container's devices,
//   - removes the container's drive,
//   - unmounts the host mounts,
//   - cleans up the rootfs: nydus cleanup for a nydus rootfs, otherwise
//     unsharing the root filesystem.
//
// Every step runs regardless of earlier failures; failures are only logged.
func (c *Container) rollbackFailingContainerCreation(ctx context.Context) {
	// logFailure reports a failed rollback step; a nil error is ignored.
	logFailure := func(err error, msg string) {
		if err != nil {
			c.Logger().WithError(err).Error(msg)
		}
	}

	logFailure(c.detachDevices(ctx), "rollback failed detachDevices()")
	logFailure(c.removeDrive(ctx), "rollback failed removeDrive()")
	logFailure(c.unmountHostMounts(ctx), "rollback failed unmountHostMounts()")

	if IsNydusRootFSType(c.rootFs.Type) {
		logFailure(nydusContainerCleanup(ctx, getMountPath(c.sandbox.id), c), "rollback failed nydusContainerCleanup()")
		return
	}

	logFailure(c.sandbox.fsShare.UnshareRootFilesystem(ctx, c), "rollback failed UnshareRootFilesystem()")
}
// checkBlockDeviceSupport reports whether block devices can be used for this
// container: block device use must not be disabled in the hypervisor
// configuration, the agent must support block devices and the hypervisor
// must support block device hotplug.
func (c *Container) checkBlockDeviceSupport(ctx context.Context) bool {
	if c.sandbox.config.HypervisorConfig.DisableBlockDeviceUse {
		return false
	}

	agentCaps := c.sandbox.agent.capabilities()
	hypervisorCaps := c.sandbox.hypervisor.Capabilities(ctx)

	return agentCaps.IsBlockDeviceSupported() && hypervisorCaps.IsBlockDeviceHotplugSupported()
}
// sortContainerVFIODevices returns the devices reordered so that VFIO control
// devices come first, followed by all other devices (e.g. /dev/vfio/<group>)
// in their original order. When several control devices are present, the one
// seen last ends up first. The result is nil for an empty input.
func sortContainerVFIODevices(devices []config.DeviceInfo) []config.DeviceInfo {
	var controlDevices, otherDevices []config.DeviceInfo

	for _, dev := range devices {
		if !deviceManager.IsVFIOControlDevice(dev.ContainerPath) {
			otherDevices = append(otherDevices, dev)
			continue
		}
		// Each control device goes in front of the ones found before it.
		controlDevices = append([]config.DeviceInfo{dev}, controlDevices...)
	}

	return append(controlDevices, otherDevices...)
}
// DeviceRelation ties a PCIe device to its position among its siblings.
// annotateContainerWithVFIOMetadata fills Bus and Path from a device's bus and
// host path, sorts the entries by Bus and then numbers them through Index.
type DeviceRelation struct {
// Bus is the bus of the device; it is the sort key.
Bus string
// Path is the host path of the device.
Path string
// Index is the zero-based position of the device after sorting by Bus.
Index int
}
// Depending on the HW we might need to inject metadata into the container
// In this case for the NV GPU we need to provide the correct mapping from
// VFIO-<NUM> to GPU index inside of the VM when vfio_mode="guest-kernel",
// otherwise we do not know which GPU is which.
func (c *Container) annotateContainerWithVFIOMetadata(devices interface{}) {
modeIsGK := (c.sandbox.config.VfioMode == config.VFIOModeGuestKernel)
if modeIsGK {
// Hot plug is done let's update meta information about the
// hot plugged devices especially VFIO devices in modeIsGK
siblings := make([]DeviceRelation, 0)
// In the sandbox we first create the root-ports and secondly
// the switch-ports. The range over map is not deterministic
// so lets first iterate over all root-port devices and then
// switch-port devices no special handling for bridge-port (PCI)
for _, dev := range config.PCIeDevicesPerPort["root-port"] {
// For the NV GPU we need special handling let's use only those
if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") {
siblings = append(siblings, DeviceRelation{Bus: dev.Bus, Path: dev.HostPath})
}
}
for _, dev := range config.PCIeDevicesPerPort["switch-port"] {
// For the NV GPU we need special handling let's use only those
if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") {
siblings = append(siblings, DeviceRelation{Bus: dev.Bus, Path: dev.HostPath})
}
}
// We need to sort the VFIO devices by bus to get the correct
// ordering root-port < switch-port
sort.Slice(siblings, func(i, j int) bool {
return siblings[i].Bus < siblings[j].Bus
})
for i := range siblings {
siblings[i].Index = i
}
// Now that we have the index lets connect the /dev/vfio/<num>
// to the correct index
if devices, ok := devices.([]ContainerDevice); ok {
for _, dev := range devices {
c.siblingAnnotation(dev.ContainerPath, siblings)
}
}
if devices, ok := devices.([]config.DeviceInfo); ok {
for _, dev := range devices {
c.siblingAnnotation(dev.ContainerPath, siblings)