Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
fajpunk committed Sep 27, 2024
1 parent 69f063b commit cb7e0a9
Show file tree
Hide file tree
Showing 17 changed files with 430 additions and 56 deletions.
27 changes: 27 additions & 0 deletions applications/mobu/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,20 @@ spec:
containers:
- name: {{ .Chart.Name }}
env:
- name: "KAFKA_SECURITY_PROTOCOL"
value: "SSL"
# From KafkaAccess
- name: "KAFKA_BOOTSTRAP_SERVERS"
valueFrom:
secretKeyRef:
name: mobu-kafka
key: "bootstrapServers"
- name: "KAFKA_CLUSTER_CA_PATH"
value: "/etc/kafkacluster/ca.crt"
- name: "KAFKA_CLIENT_CERT_PATH"
value: "/etc/kafkauser/user.crt"
- name: "KAFKA_CLIENT_KEY_PATH"
value: "/etc/kafkauser/user.key"
{{- if .Values.config.slackAlerts }}
- name: "MOBU_ALERT_HOOK"
valueFrom:
Expand Down Expand Up @@ -103,6 +117,15 @@ spec:
- ALL
readOnlyRootFilesystem: true
volumeMounts:
- name: "kafka"
mountPath: "/etc/kafkacluster/ca.crt"
subPath: "ssl.truststore.crt" # CA cert from the Kafka cluster
- name: "kafka"
mountPath: "/etc/kafkauser/user.crt"
subPath: "ssl.keystore.crt" # User cert from the Kafka cluster signed by the clients' CA
- name: "kafka"
mountPath: "/etc/kafkauser/user.key"
subPath: "ssl.keystore.key" # private key for the consuming client
- name: "config"
mountPath: "/etc/mobu"
readOnly: true
Expand All @@ -113,6 +136,10 @@ spec:
runAsUser: 1000
runAsGroup: 1000
volumes:
# This secret comes from the KafkaAccess operator
- name: "kafka"
secret:
secretName: mobu-kafka
- name: "config"
projected:
sources:
Expand Down
14 changes: 14 additions & 0 deletions applications/mobu/templates/kafkaaccess.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
# Requests TLS access to the sasquatch Kafka cluster for the existing
# `mobu` KafkaUser. The KafkaAccess operator materializes this as the
# `mobu-kafka` secret, which the mobu deployment mounts for its
# bootstrap servers, CA cert, client cert, and client key.
apiVersion: access.strimzi.io/v1alpha1
kind: KafkaAccess
metadata:
  name: mobu-kafka
spec:
  kafka:
    # Target Strimzi Kafka cluster and the listener to connect through.
    name: sasquatch
    namespace: sasquatch
    listener: tls
  user:
    # Pre-existing KafkaUser whose credentials are exposed via the secret.
    kind: KafkaUser
    apiGroup: kafka.strimzi.io
    name: mobu
    namespace: sasquatch
3 changes: 3 additions & 0 deletions applications/sasquatch/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ dependencies:
- name: square-events
condition: squareEvents.enabled
version: 1.0.0
- name: app-metrics
condition: appMetrics.enabled
version: 1.0.0

annotations:
phalanx.lsst.io/docs: |
Expand Down
29 changes: 28 additions & 1 deletion applications/sasquatch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Rubin Observatory's telemetry service
| global.baseUrl | string | Set by Argo CD | Base URL for the environment |
| global.host | string | Set by Argo CD | Host name for ingress |
| global.vaultSecretsPath | string | Set by Argo CD | Base path for Vault secrets |
| appMetrics.enabled | bool | `false` | Enable the AppMetrics subchart with topic, user, and telegraf configurations |
| chronograf.enabled | bool | `true` | Whether Chronograf is enabled |
| chronograf.env | object | See `values.yaml` | Additional environment variables for Chronograf |
| chronograf.envFromSecret | string | `"sasquatch"` | Name of secret to use. The keys `generic_client_id`, `generic_client_secret`, and `token_secret` should be set. |
Expand Down Expand Up @@ -81,6 +82,33 @@ Rubin Observatory's telemetry service
| strimzi-registry-operator.clusterNamespace | string | `"sasquatch"` | Namespace where the Strimzi Kafka cluster is deployed |
| strimzi-registry-operator.operatorNamespace | string | `"sasquatch"` | Namespace where the strimzi-registry-operator is deployed |
| telegraf-kafka-consumer | object | `{}` | Overrides for telegraf-kafka-consumer configuration |
| app-metrics.affinity | object | `{}` | Affinity for pod assignment |
| app-metrics.args | list | `[]` | Arguments passed to the Telegraf agent containers |
| app-metrics.cluster.name | string | `"sasquatch"` | |
| app-metrics.collection_jitter | string | "0s" | Data collection jitter. This is used to jitter the collection by a random amount. Each plugin will sleep for a random time within jitter before collecting. |
| app-metrics.debug | bool | false | Run Telegraf in debug mode. |
| app-metrics.env | list | See `values.yaml` | Telegraf agent environment variables |
| app-metrics.envFromSecret | string | `""` | Name of the secret with values to be added to the environment. |
| app-metrics.flush_interval | string | "10s" | Data flushing interval for all outputs. Don’t set this below interval. Maximum flush_interval is flush_interval + flush_jitter |
| app-metrics.flush_jitter | string | "0s" | Jitter the flush interval by a random amount. This is primarily to avoid large write spikes for users running a large number of telegraf instances. |
| app-metrics.image.pullPolicy | string | `"Always"` | Image pull policy |
| app-metrics.image.repo | string | `"docker.io/library/telegraf"` | Telegraf image repository |
| app-metrics.image.tag | string | `"1.30.2-alpine"` | Telegraf image tag |
| app-metrics.imagePullSecrets | list | `[]` | Secret names to use for Docker pulls |
| app-metrics.influxdb.database | string | `"telegraf-kafka-app-metrics-consumer-v1"` | Name of the InfluxDB v1 database to write to |
| app-metrics.influxdb.url | string | `"http://sasquatch-influxdb.sasquatch:8086"` | URL of the InfluxDB v1 instance to write to |
| app-metrics.metric_batch_size | int | 5000 | Sends metrics to the output in batches of at most metric_batch_size metrics. |
| app-metrics.metric_buffer_limit | int | 100000 | Caches metric_buffer_limit metrics for each output, and flushes this buffer on a successful write. This should be a multiple of metric_batch_size and must not be less than 2 times metric_batch_size. |
| app-metrics.metricsApps[0] | string | `"mobu"` | |
| app-metrics.metricsApps[1] | string | `"some-other-app"` | |
| app-metrics.metricsApps[2] | string | `"yet-another-app"` | |
| app-metrics.metricsTags | string | `"[ \"service\", \"username\" ]\n"` | |
| app-metrics.nodeSelector | object | `{}` | Node labels for pod assignment |
| app-metrics.podAnnotations | object | `{}` | Annotations for telegraf-kafka-consumers pods |
| app-metrics.podLabels | object | `{}` | Labels for telegraf-kafka-consumer pods |
| app-metrics.replicaCount | int | `1` | Number of Telegraf replicas. Increase this value to increase the consumer throughput. |
| app-metrics.resources | object | See `values.yaml` | Kubernetes resources requests and limits |
| app-metrics.tolerations | list | `[]` | Tolerations for pod assignment |
| influxdb-enterprise.bootstrap.auth.secretName | string | `"sasquatch"` | Enable authentication of the data nodes using this secret, by creating a username and password for an admin account. The secret must contain keys `username` and `password`. |
| influxdb-enterprise.bootstrap.ddldml.configMap | string | Do not run DDL or DML | A config map containing DDL and DML that define databases, retention policies, and inject some data. The keys `ddl` and `dml` must exist, even if one of them is empty. DDL is executed before DML to ensure databases and retention policies exist. |
| influxdb-enterprise.bootstrap.ddldml.resources | object | `{}` | Kubernetes resources and limits for the bootstrap job |
Expand Down Expand Up @@ -389,7 +417,6 @@ Rubin Observatory's telemetry service
| strimzi-kafka.registry.resources | object | See `values.yaml` | Kubernetes requests and limits for the Schema Registry |
| strimzi-kafka.registry.schemaTopic | string | `"registry-schemas"` | Name of the topic used by the Schema Registry |
| strimzi-kafka.superusers | list | `["kafka-admin"]` | A list of usernames for users who should have global admin permissions. These users will be created, along with their credentials. |
| strimzi-kafka.users.appmetrics.enabled | bool | `false` | Enable user appmetrics |
| strimzi-kafka.users.camera.enabled | bool | `false` | Enable user camera, used at the camera environments |
| strimzi-kafka.users.consdb.enabled | bool | `false` | Enable user consdb |
| strimzi-kafka.users.kafdrop.enabled | bool | `false` | Enable user Kafdrop (deployed by parent Sasquatch chart). |
Expand Down
6 changes: 6 additions & 0 deletions applications/sasquatch/charts/app-metrics/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Subchart of sasquatch; enabled via the parent chart's
# `appMetrics.enabled` condition.
apiVersion: v2
name: app-metrics
# Chart version; must match the version pinned in the parent chart's
# dependencies list.
version: 1.0.0
appVersion: "1.0.0"
description: Kafka topics, users, and a telegraf connector for metrics events.
type: application
35 changes: 35 additions & 0 deletions applications/sasquatch/charts/app-metrics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# app-metrics

Kafka topics, users, and a telegraf connector for metrics events.

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| affinity | object | `{}` | Affinity for pod assignment |
| args | list | `[]` | Arguments passed to the Telegraf agent containers |
| cluster.name | string | `"sasquatch"` | |
| collection_jitter | string | "0s" | Data collection jitter. This is used to jitter the collection by a random amount. Each plugin will sleep for a random time within jitter before collecting. |
| debug | bool | false | Run Telegraf in debug mode. |
| env | list | See `values.yaml` | Telegraf agent environment variables |
| envFromSecret | string | `""` | Name of the secret with values to be added to the environment. |
| flush_interval | string | "10s" | Data flushing interval for all outputs. Don’t set this below interval. Maximum flush_interval is flush_interval + flush_jitter |
| flush_jitter | string | "0s" | Jitter the flush interval by a random amount. This is primarily to avoid large write spikes for users running a large number of telegraf instances. |
| image.pullPolicy | string | `"Always"` | Image pull policy |
| image.repo | string | `"docker.io/library/telegraf"` | Telegraf image repository |
| image.tag | string | `"1.30.2-alpine"` | Telegraf image tag |
| imagePullSecrets | list | `[]` | Secret names to use for Docker pulls |
| influxdb.database | string | `"telegraf-kafka-app-metrics-consumer-v1"` | Name of the InfluxDB v1 database to write to |
| influxdb.url | string | `"http://sasquatch-influxdb.sasquatch:8086"` | URL of the InfluxDB v1 instance to write to |
| metric_batch_size | int | 5000 | Sends metrics to the output in batches of at most metric_batch_size metrics. |
| metric_buffer_limit | int | 100000 | Caches metric_buffer_limit metrics for each output, and flushes this buffer on a successful write. This should be a multiple of metric_batch_size and must not be less than 2 times metric_batch_size. |
| metricsApps[0] | string | `"mobu"` | |
| metricsApps[1] | string | `"some-other-app"` | |
| metricsApps[2] | string | `"yet-another-app"` | |
| metricsTags | string | `"[ \"service\", \"username\" ]\n"` | |
| nodeSelector | object | `{}` | Node labels for pod assignment |
| podAnnotations | object | `{}` | Annotations for telegraf-kafka-consumers pods |
| podLabels | object | `{}` | Labels for telegraf-kafka-consumer pods |
| replicaCount | int | `1` | Number of Telegraf replicas. Increase this value to increase the consumer throughput. |
| resources | object | See `values.yaml` | Kubernetes resources requests and limits |
| tolerations | list | `[]` | Tolerations for pod assignment |
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{{- range .Values.metricsApps }}
---
# One KafkaTopic per application listed in .Values.metricsApps; the
# matching telegraf consumer subscribes via the
# "lsst.square.app-metrics.events.*" regexp.
apiVersion: kafka.strimzi.io/v1beta2
kind: KafkaTopic
metadata:
  name: "lsst.square.app-metrics.events.{{ . }}"
  labels:
    # Tells the Strimzi topic operator which cluster owns this topic.
    strimzi.io/cluster: {{ $.Values.cluster.name }}
spec:
  partitions: 10
  replicas: 3
  config:
    # http://kafka.apache.org/documentation/#topicconfigs
    retention.ms: 86400000  # 1 day
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{{- range .Values.metricsApps }}
---
# One KafkaUser per application listed in .Values.metricsApps,
# authenticated with mTLS and authorized to produce/consume only its own
# events topic.
apiVersion: kafka.strimzi.io/v1beta2
kind: KafkaUser
metadata:
  name: square-app-metrics-{{ . }}
  labels:
    # Tells the Strimzi user operator which cluster owns this user.
    strimzi.io/cluster: {{ $.Values.cluster.name }}
spec:
  authentication:
    type: tls
  authorization:
    type: simple
    acls:
      # Allow joining any consumer group prefixed "app-metrics-events".
      - resource:
          type: group
          name: app-metrics-events
          patternType: prefix
        operations:
          - "Read"
        host: "*"
      # Full produce/consume access, but only to this app's own topic.
      - resource:
          type: topic
          name: "lsst.square.app-metrics.events.{{ . }}"
          patternType: literal
        operations:
          - "Describe"
          - "Read"
          - "Write"
        host: "*"
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
---
# Telegraf configuration for the app-metrics consumer: reads Avro events
# from the lsst.square.app-metrics.events.* topics and writes them to
# InfluxDB v1, with telegraf's own internal metrics routed to the
# "telegraf" database.
apiVersion: v1
kind: ConfigMap
metadata:
  name: sasquatch-telegraf-app-metrics
  labels:
    app.kubernetes.io/name: sasquatch-telegraf
    app.kubernetes.io/instance: sasquatch-telegraf-app-metrics
    app.kubernetes.io/part-of: sasquatch
data:
  telegraf.conf: |+
    [agent]
      # Agent tuning is exposed as chart values (see README); defaults
      # here match the previously hard-coded settings.
      metric_batch_size = {{ default 5000 .Values.metric_batch_size }}
      metric_buffer_limit = {{ default 100000 .Values.metric_buffer_limit }}
      collection_jitter = {{ default "0s" .Values.collection_jitter | quote }}
      flush_interval = {{ default "10s" .Values.flush_interval | quote }}
      flush_jitter = {{ default "0s" .Values.flush_jitter | quote }}
      debug = {{ default false .Values.debug }}
      omit_hostname = true

    # Primary output: app metrics to the configured InfluxDB v1 database.
    [[outputs.influxdb]]
      urls = [
        {{ .Values.influxdb.url | quote }}
      ]
      database = {{ .Values.influxdb.database | quote }}
      username = "${INFLUXDB_USER}"
      password = "${INFLUXDB_PASSWORD}"

    # Secondary output: telegraf's own internal metrics (namepass) go to
    # the shared "telegraf" database.
    [[outputs.influxdb]]
      namepass = ["telegraf_*"]
      urls = [
        {{ .Values.influxdb.url | quote }}
      ]
      database = "telegraf"
      username = "${INFLUXDB_USER}"
      password = "${INFLUXDB_PASSWORD}"

    [[inputs.kafka_consumer]]
      brokers = [
        "sasquatch-kafka-brokers.sasquatch:9092"
      ]
      consumer_group = "telegraf-kafka-consumer-app-metrics"
      sasl_mechanism = "SCRAM-SHA-512"
      sasl_password = "$TELEGRAF_PASSWORD"
      sasl_username = "telegraf"
      data_format = "avro"
      avro_schema_registry = "http://sasquatch-schema-registry.sasquatch:8081"
      avro_timestamp = "timestamp_ns"
      avro_timestamp_format = "unix_ns"
      avro_union_mode = "nullable"
      # Rendered as a TOML array from the metricsTags value string.
      avro_tags = {{ .Values.metricsTags }}
      # One topic per app; see the KafkaTopic template in this chart.
      topic_regexps = [ "lsst.square.app-metrics.events.*" ]
      max_processing_time = "5s"
      consumer_fetch_default = "5MB"
      max_undelivered_messages = 10000
      compression_codec = 3

    # Self-monitoring: emit telegraf_* internal metrics, tagged per
    # consumer instance.
    [[inputs.internal]]
      name_prefix = "telegraf_"
      collect_memstats = true
      tags = { instance = "app-metrics" }
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
# Runs the telegraf consumer for app metrics; configuration comes from
# the sasquatch-telegraf-app-metrics ConfigMap mounted at /etc/telegraf.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sasquatch-telegraf-app-metrics
  labels:
    app.kubernetes.io/name: sasquatch-telegraf
    app.kubernetes.io/instance: sasquatch-telegraf-app-metrics
    app.kubernetes.io/part-of: sasquatch
spec:
  replicas: {{ default 1 .Values.replicaCount }}
  selector:
    matchLabels:
      app.kubernetes.io/instance: sasquatch-telegraf-app-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: sasquatch-telegraf-app-metrics
      annotations:
        # Roll the pods whenever the set of monitored apps changes so
        # the regenerated ConfigMap is picked up.
        checksum/apps: {{ .Values.metricsApps | toString | sha256sum }}
        {{- if .Values.podAnnotations }}
        {{- toYaml .Values.podAnnotations | nindent 8 }}
        {{- end }}
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
      containers:
      - name: telegraf
        securityContext:
          capabilities:
            drop:
            # Kubernetes expects the uppercase sentinel "ALL" here;
            # lowercase "all" is not a recognized capability name.
            - "ALL"
          readOnlyRootFilesystem: true
          allowPrivilegeEscalation: false
        image: "{{ .Values.image.repo }}:{{ .Values.image.tag }}"
        imagePullPolicy: {{ default "IfNotPresent" .Values.image.pullPolicy | quote }}
        {{- if .Values.resources }}
        resources:
          {{- toYaml .Values.resources | nindent 10 }}
        {{- end }}
        {{- if .Values.args }}
        args:
        {{- toYaml .Values.args | nindent 8 }}
        {{- end }}
        {{- if .Values.env }}
        env:
        {{- toYaml .Values.env | nindent 8 }}
        {{- end }}
        {{- if .Values.envFromSecret }}
        envFrom:
        - secretRef:
            name: {{ .Values.envFromSecret }}
        {{- end }}
        volumeMounts:
        - name: config
          mountPath: /etc/telegraf
      {{- if .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml .Values.imagePullSecrets | nindent 8 }}
      {{- end }}
      {{- if .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml .Values.nodeSelector | nindent 8 }}
      {{- end }}
      {{- if .Values.affinity }}
      affinity:
        {{- toYaml .Values.affinity | nindent 8 }}
      {{- end }}
      {{- if .Values.tolerations }}
      tolerations:
        {{- toYaml .Values.tolerations | nindent 8 }}
      {{- end }}
      volumes:
      - name: config
        configMap:
          name: sasquatch-telegraf-app-metrics
Loading

0 comments on commit cb7e0a9

Please sign in to comment.