From f335e36e98076657f6a0c7ffba0877b3995f8f28 Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Thu, 20 Apr 2023 09:39:31 +0200 Subject: [PATCH 01/10] Add build information to the metrics/queries This allows use cases where we can see and query the changes in metrics after a version change in the code base. Ref: https://github.com/autometrics-dev/autometrics-rs/pull/69 Ref: https://github.com/orgs/autometrics-dev/discussions/10 --- README.md | 35 ++++++++++++++-- examples/otel/cmd/main.go | 13 +++++- examples/web/cmd/main.go | 13 +++++- examples/web/cmd/main.go.orig | 13 +++++- internal/autometrics/doc.go | 53 ++++++++++++++++++++++-- pkg/autometrics/global_state.go | 35 ++++++++++++++++ pkg/autometrics/main.go | 38 +++++++++++++---- pkg/autometrics/otel/instrument.go | 17 +++++++- pkg/autometrics/otel/otel.go | 37 ++++++++++++++++- pkg/autometrics/prometheus/instrument.go | 29 +++++++++---- pkg/autometrics/prometheus/prometheus.go | 42 ++++++++++++++++--- 11 files changed, 289 insertions(+), 36 deletions(-) create mode 100644 pkg/autometrics/global_state.go diff --git a/README.md b/README.md index 6a39b64..9b5f823 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,18 @@ import ( And then in your main function initialize the metrics ``` go -amImpl.Init(nil, am.DefBuckets) + // Everything in BuildInfo is optional. + // You can also use any string variable whose value is + // injected at build time by ldflags. + amImpl.Init( + nil, + amImpl.DefBuckets, + amImpl.BuildInfo{ + Version: "0.4.0", + Commit: "anySHA", + BuildTime: "", + }, + ) ``` > **Warning** @@ -134,7 +145,15 @@ import ( func main() { - amImpl.Init(nil, am.DefBuckets) + amImpl.Init( + nil, + amImpl.DefBuckets, + amImpl.BuildInfo{ + Version: "0.4.0", + Commit: "anySHA", + BuildTime: "", + }, + ) http.Handle("/metrics", promhttp.Handler()) } ``` @@ -188,8 +207,16 @@ the `Init` function takes a meter name for the `otel_scope` label of the exporte metric. 
You can use the name of the application or its version for example ``` patch -- amImpl.Init(nil, am.DefBuckets) -+ amImpl.Init("myApp/v2/prod", am.DefBuckets) + amImpl.Init( +- nil, ++ "myApp/v2/prod", + amImpl.DefBuckets, + amImpl.BuildInfo{ + Version: "2.1.37", + Commit: "anySHA", + BuildTime: "", + }, + ) ``` - add the `-otel` flag to the `//go:generate` directive diff --git a/examples/otel/cmd/main.go b/examples/otel/cmd/main.go index ae84fef..e171565 100644 --- a/examples/otel/cmd/main.go +++ b/examples/otel/cmd/main.go @@ -19,7 +19,18 @@ import ( func main() { rand.Seed(time.Now().UnixNano()) - amImpl.Init("web-server", amImpl.DefBuckets) + // Everything in BuildInfo is optional. + // You can also use any string variable whose value is + // injected at build time by ldflags. + amImpl.Init( + "web-server", + amImpl.DefBuckets, + amImpl.BuildInfo{ + Version: "0.4.0", + Commit: "anySHA", + BuildTime: "", + }, + ) http.HandleFunc("/", errorable(indexHandler)) http.HandleFunc("/random-error", errorable(randomErrorHandler)) diff --git a/examples/web/cmd/main.go b/examples/web/cmd/main.go index f0b1b4d..e9260ca 100644 --- a/examples/web/cmd/main.go +++ b/examples/web/cmd/main.go @@ -19,7 +19,18 @@ import ( func main() { rand.Seed(time.Now().UnixNano()) - amImpl.Init(nil, amImpl.DefBuckets) + // Everything in BuildInfo is optional. + // You can also use any string variable whose value is + // injected at build time by ldflags. 
+ amImpl.Init( + nil, + amImpl.DefBuckets, + amImpl.BuildInfo{ + Version: "0.4.0", + Commit: "anySHA", + BuildTime: "", + }, + ) http.HandleFunc("/", errorable(indexHandler)) http.HandleFunc("/random-error", errorable(randomErrorHandler)) diff --git a/examples/web/cmd/main.go.orig b/examples/web/cmd/main.go.orig index 99e9f76..2fe7a85 100644 --- a/examples/web/cmd/main.go.orig +++ b/examples/web/cmd/main.go.orig @@ -19,7 +19,18 @@ import ( func main() { rand.Seed(time.Now().UnixNano()) - amImpl.Init(nil, amImpl.DefBuckets) + // Everything in BuildInfo is optional. + // You can also use any string variable whose value is + // injected at build time by ldflags. + amImpl.Init( + nil, + amImpl.DefBuckets, + amImpl.BuildInfo{ + Version: "0.4.0", + Commit: "anySHA", + BuildTime: "", + }, + ) http.HandleFunc("/", errorable(indexHandler)) http.HandleFunc("/random-error", errorable(randomErrorHandler)) diff --git a/internal/autometrics/doc.go b/internal/autometrics/doc.go index 5a41ed1..6efa99c 100644 --- a/internal/autometrics/doc.go +++ b/internal/autometrics/doc.go @@ -46,22 +46,67 @@ func (p Prometheus) makePrometheusUrl(query, comment string) url.URL { return ret } +func addBuildInfoLabels() string { + return fmt.Sprintf("* on (instance, job) group_left(%s, %s) %s", + prometheus.VersionLabel, + prometheus.CommitLabel, + prometheus.BuildInfoName, + ) +} + func requestRateQuery(counterName, labelKey, labelValue string) string { - return fmt.Sprintf("sum by (%s, %s) (rate(%s{%s=\"%s\"}[5m]))", prometheus.FunctionLabel, prometheus.ModuleLabel, counterName, labelKey, labelValue) + return fmt.Sprintf("sum by (%s, %s, %s, %s) (rate(%s{%s=\"%s\"}[5m]) %s)", + prometheus.FunctionLabel, + prometheus.ModuleLabel, + prometheus.VersionLabel, + prometheus.CommitLabel, + counterName, + labelKey, + labelValue, + addBuildInfoLabels(), + ) } func errorRatioQuery(counterName, labelKey, labelValue string) string { - return fmt.Sprintf("sum by (%s, %s) 
(rate(%s{%s=\"%s\",%s=\"error\"}[5m]))", prometheus.FunctionLabel, prometheus.ModuleLabel, counterName, labelKey, labelValue, prometheus.ResultLabel) + return fmt.Sprintf("sum by (%s, %s, %s, %s) (rate(%s{%s=\"%s\",%s=\"error\"}[5m]) %s)", + prometheus.FunctionLabel, + prometheus.ModuleLabel, + prometheus.VersionLabel, + prometheus.CommitLabel, + counterName, + labelKey, + labelValue, + prometheus.ResultLabel, + addBuildInfoLabels(), + ) } func latencyQuery(bucketName, labelKey, labelValue string) string { - latency := fmt.Sprintf("sum by (le, %s, %s) (rate(%s_bucket{%s=\"%s\"}[5m]))", prometheus.FunctionLabel, prometheus.ModuleLabel, bucketName, labelKey, labelValue) + latency := fmt.Sprintf("sum by (le, %s, %s, %s, %s) (rate(%s_bucket{%s=\"%s\"}[5m]) %s)", + prometheus.FunctionLabel, + prometheus.ModuleLabel, + prometheus.VersionLabel, + prometheus.CommitLabel, + bucketName, + labelKey, + labelValue, + addBuildInfoLabels(), + ) return fmt.Sprintf("histogram_quantile(0.99, %s) or histogram_quantile(0.95, %s)", latency, latency) } func concurrentCallsQuery(gaugeName, labelKey, labelValue string) string { - return fmt.Sprintf("sum by (%s, %s) %s{%s=\"%s\"}", prometheus.FunctionLabel, prometheus.ModuleLabel, gaugeName, labelKey, labelValue) + return fmt.Sprintf("sum by (%s, %s, %s, %s) (%s{%s=\"%s\"} %s)", + prometheus.FunctionLabel, + prometheus.ModuleLabel, + prometheus.VersionLabel, + prometheus.CommitLabel, + gaugeName, + labelKey, + labelValue, + addBuildInfoLabels(), + ) } func (p Prometheus) GenerateAutometricsComment(ctx GeneratorContext, funcName, moduleName string) []string { diff --git a/pkg/autometrics/global_state.go b/pkg/autometrics/global_state.go new file mode 100644 index 0000000..10fb715 --- /dev/null +++ b/pkg/autometrics/global_state.go @@ -0,0 +1,35 @@ +package autometrics // import "github.com/autometrics-dev/autometrics-go/pkg/autometrics" + +var version string +var commit string +var buildTime string + +// GetVersion returns the version of 
the codebase being instrumented. +func GetVersion() string { + return version +} + +// SetVersion sets the version of the codebase being instrumented. +func SetVersion(newVersion string) { + version = newVersion +} + +// GetCommit returns the commit of the codebase being instrumented. +func GetCommit() string { + return commit +} + +// SetCommit sets the commit of the codebase being instrumented. +func SetCommit(newCommit string) { + commit = newCommit +} + +// GetBuildTime returns the build timestamp of the codebase being instrumented. +func GetBuildTime() string { + return buildTime +} + +// SetBuildTime sets the build timestamp of the codebase being instrumented. +func SetBuildTime(newBuildTime string) { + buildTime = newBuildTime +} diff --git a/pkg/autometrics/main.go b/pkg/autometrics/main.go index 17bc033..6ba1f27 100644 --- a/pkg/autometrics/main.go +++ b/pkg/autometrics/main.go @@ -1,4 +1,4 @@ -package autometrics +package autometrics // import "github.com/autometrics-dev/autometrics-go/pkg/autometrics" import ( "context" @@ -38,19 +38,25 @@ type Context struct { TrackCallerName bool // AlertConf is an optional configuration to add alerting capabilities to the metrics. AlertConf *AlertConfiguration - // startTime is the start time of a single function execution. - // Only autometrics.Instrument should read this value. - // Only autometrics.PreInstrument should write this value. + // StartTime is the start time of a single function execution. + // Only amImpl.Instrument should read this value. + // Only amImpl.PreInstrument should write this value. // // This value is only exported for the child packages "prometheus" and "otel" StartTime time.Time - // callInfo contains all the relevant data for caller information. - // Only autometrics.Instrument should read this value. - // Only autometrics.PreInstrument should write/read this value. + // CallInfo contains all the relevant data for caller information. + // Only amImpl.Instrument should read this value. 
+ // Only amImpl.PreInstrument should write/read this value. // // This value is only exported for the child packages "prometheus" and "otel" CallInfo CallInfo - Context context.Context + // BuildInfo contains all the relevant data for build information. + // Only amImpl.Instrument and PreInstrument should read this value. + // Only amImpl.Init should write/read this value. + // + // This value is only exported for the child packages "prometheus" and "otel" + BuildInfo BuildInfo + Context context.Context } // CallInfo holds the information about the current function call and its parent names. @@ -65,6 +71,16 @@ type CallInfo struct { ParentModuleName string } +// BuildInfo holds the information about the current build of the instrumented code. +type BuildInfo struct { + // Commit is the commit of the code. + Commit string + // Version is the version of the code. + Version string + // BuildTime is the timestamp of the build of the codebase. + BuildTime string +} + func NewContext() Context { return Context{ TrackConcurrentCalls: true, @@ -74,6 +90,12 @@ func NewContext() Context { } } +func (c *Context) FillBuildInfo() { + c.BuildInfo.Version = GetVersion() + c.BuildInfo.Commit = GetCommit() + c.BuildInfo.BuildTime = GetBuildTime() +} + func (c Context) Validate(allowCustomLatencies bool) error { if c.AlertConf != nil { if c.AlertConf.ServiceName == "" { diff --git a/pkg/autometrics/otel/instrument.go b/pkg/autometrics/otel/instrument.go index def0639..cf2a8aa 100644 --- a/pkg/autometrics/otel/instrument.go +++ b/pkg/autometrics/otel/instrument.go @@ -15,7 +15,7 @@ import ( // // The first argument SHOULD be a call to PreInstrument so that // the "concurrent calls" gauge is correctly setup. 
-func Instrument(ctx *autometrics.Context, err *error) { +func Instrument(ctx *autometrics.Context, err *error) { result := "ok" if err != nil && *err != nil { @@ -49,6 +49,9 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(ResultLabel).String(result), attribute.Key(TargetSuccessRateLabel).String(successObjective), attribute.Key(SloNameLabel).String(sloName), + attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), + attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), + attribute.Key(BuildTimeLabel).String(ctx.BuildInfo.BuildTime), }...) functionCallsDuration.Record(ctx.Context, time.Since(ctx.StartTime).Seconds(), []attribute.KeyValue{ @@ -58,6 +61,9 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(TargetLatencyLabel).String(latencyTarget), attribute.Key(TargetSuccessRateLabel).String(latencyObjective), attribute.Key(SloNameLabel).String(sloName), + attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), + attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), + attribute.Key(BuildTimeLabel).String(ctx.BuildInfo.BuildTime), }...) if ctx.TrackConcurrentCalls { @@ -66,6 +72,9 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(FunctionLabel).String(ctx.CallInfo.FuncName), attribute.Key(ModuleLabel).String(ctx.CallInfo.ModuleName), attribute.Key(CallerLabel).String(callerLabel), + attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), + attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), + attribute.Key(BuildTimeLabel).String(ctx.BuildInfo.BuildTime), }...) } } @@ -74,8 +83,9 @@ func Instrument(ctx *autometrics.Context, err *error) { // // It is meant to be called as the first argument to Instrument in a // defer call. 
-func PreInstrument(ctx *autometrics.Context) *autometrics.Context { +func PreInstrument(ctx *autometrics.Context) *autometrics.Context { ctx.CallInfo = autometrics.CallerInfo() + ctx.FillBuildInfo() ctx.Context = context.Background() var callerLabel string @@ -89,6 +99,9 @@ func PreInstrument(ctx *autometrics.Context) *autometrics.Context { attribute.Key(FunctionLabel).String(ctx.CallInfo.FuncName), attribute.Key(ModuleLabel).String(ctx.CallInfo.ModuleName), attribute.Key(CallerLabel).String(callerLabel), + attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), + attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), + attribute.Key(BuildTimeLabel).String(ctx.BuildInfo.BuildTime), }...) } diff --git a/pkg/autometrics/otel/otel.go b/pkg/autometrics/otel/otel.go index bd971ff..e148e09 100644 --- a/pkg/autometrics/otel/otel.go +++ b/pkg/autometrics/otel/otel.go @@ -1,10 +1,12 @@ package otel // import "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" import ( + "context" "fmt" "github.com/autometrics-dev/autometrics-go/pkg/autometrics" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/prometheus" "go.opentelemetry.io/otel/metric/instrument" "go.opentelemetry.io/otel/sdk/instrumentation" @@ -16,6 +18,7 @@ var ( functionCallsCount instrument.Int64UpDownCounter functionCallsDuration instrument.Float64Histogram functionCallsConcurrent instrument.Int64UpDownCounter + buildInfo instrument.Int64UpDownCounter DefBuckets = autometrics.DefBuckets ) @@ -26,6 +29,8 @@ const ( FunctionCallsDurationName = "function.calls.duration" // FunctionCallsConcurrentName is the name of the openTelemetry metric for the number of simulateneously active calls to specific functions. FunctionCallsConcurrentName = "function.calls.concurrent" + // BuildInfoName is the name of the openTelemetry metric for the version of the monitored codebase. + BuildInfoName = "build_info" // FunctionLabel is the openTelemetry attribute that describes the function name. 
// @@ -56,20 +61,36 @@ const ( TargetSuccessRateLabel = "objective.percentile" // SloLabelName is the openTelemetry attribute that describes the name of the Service Level Objective. SloNameLabel = "objective.name" -) + // CommitLabel is the openTelemetry attribute that describes the commit of the monitored codebase. + CommitLabel = "commit" + // VersionLabel is the openTelemetry attribute that describes the version of the monitored codebase. + VersionLabel = "version" + // BuildTimeLabel is the openTelemetry attribute that describes the timestamp of the build of the monitored codebase. + BuildTimeLabel = "build_time" +) func completeMeterName(meterName string) string { return fmt.Sprintf("autometrics/%v", meterName) } +// BuildInfo holds meta information about the build of the instrumented code. +// +// This is a reexport of the autometrics type to allow [Init] to work with only +// the current (otel) package imported at the call site. +type BuildInfo = autometrics.BuildInfo + // Init sets up the metrics required for autometrics' decorated functions and registers // them to the Prometheus exporter // // Make sure that all the latency targets you want to use for SLOs are // present in the histogramBuckets array, otherwise the alerts will fail // to work (they will never trigger.) 
-func Init(meterName string, histogramBuckets []float64) error { +func Init(meterName string, histogramBuckets []float64, buildInformation BuildInfo) error { + autometrics.SetCommit(buildInformation.Commit) + autometrics.SetVersion(buildInformation.Version) + autometrics.SetBuildTime(buildInformation.BuildTime) + exporter, err := prometheus.New( // The units are removed from the exporter so that the names of the // exported metrics after the View rename are consistent with the @@ -114,5 +135,17 @@ func Init(meterName string, histogramBuckets []float64) error { return fmt.Errorf("error initializing %v metric: %w", FunctionCallsConcurrentName, err) } + buildInfo, err = meter.Int64UpDownCounter(BuildInfoName, instrument.WithDescription("The information of the current build.")) + if err != nil { + return fmt.Errorf("error initializing %v metric: %w", BuildInfoName, err) + } + + buildInfo.Add(context.Background(), 1, + []attribute.KeyValue{ + attribute.Key(CommitLabel).String(buildInformation.Commit), + attribute.Key(VersionLabel).String(buildInformation.Version), + attribute.Key(BuildTimeLabel).String(buildInformation.BuildTime), + }...) + return nil } diff --git a/pkg/autometrics/prometheus/instrument.go b/pkg/autometrics/prometheus/instrument.go index 1d8b803..e710732 100644 --- a/pkg/autometrics/prometheus/instrument.go +++ b/pkg/autometrics/prometheus/instrument.go @@ -14,7 +14,7 @@ import ( // // The first argument SHOULD be a call to PreInstrument so that // the "concurrent calls" gauge is correctly setup. 
-func Instrument(ctx *autometrics.Context, err *error) { +func Instrument(ctx *autometrics.Context, err *error) { result := "ok" if err != nil && *err != nil { @@ -47,6 +47,9 @@ func Instrument(ctx *autometrics.Context, err *error) { ResultLabel: result, TargetSuccessRateLabel: successObjective, SloNameLabel: sloName, + BuildTimeLabel: ctx.BuildInfo.BuildTime, + CommitLabel: ctx.BuildInfo.Commit, + VersionLabel: ctx.BuildInfo.Version, }).Inc() functionCallsDuration.With(prometheus.Labels{ FunctionLabel: ctx.CallInfo.FuncName, @@ -55,13 +58,19 @@ func Instrument(ctx *autometrics.Context, err *error) { TargetLatencyLabel: latencyTarget, TargetSuccessRateLabel: latencyObjective, SloNameLabel: sloName, + BuildTimeLabel: ctx.BuildInfo.BuildTime, + CommitLabel: ctx.BuildInfo.Commit, + VersionLabel: ctx.BuildInfo.Version, }).Observe(time.Since(ctx.StartTime).Seconds()) if ctx.TrackConcurrentCalls { functionCallsConcurrent.With(prometheus.Labels{ - FunctionLabel: ctx.CallInfo.FuncName, - ModuleLabel: ctx.CallInfo.ModuleName, - CallerLabel: callerLabel, + FunctionLabel: ctx.CallInfo.FuncName, + ModuleLabel: ctx.CallInfo.ModuleName, + CallerLabel: callerLabel, + BuildTimeLabel: ctx.BuildInfo.BuildTime, + CommitLabel: ctx.BuildInfo.Commit, + VersionLabel: ctx.BuildInfo.Version, }).Dec() } } @@ -70,8 +79,9 @@ func Instrument(ctx *autometrics.Context, err *error) { // // It is meant to be called as the first argument to Instrument in a // defer call. 
-func PreInstrument(ctx *autometrics.Context) *autometrics.Context { +func PreInstrument(ctx *autometrics.Context) *autometrics.Context { ctx.CallInfo = autometrics.CallerInfo() + ctx.FillBuildInfo() var callerLabel string if ctx.TrackCallerName { @@ -80,9 +90,12 @@ func PreInstrument(ctx *autometrics.Context) *autometrics.Context { if ctx.TrackConcurrentCalls { functionCallsConcurrent.With(prometheus.Labels{ - FunctionLabel: ctx.CallInfo.FuncName, - ModuleLabel: ctx.CallInfo.ModuleName, - CallerLabel: callerLabel, + FunctionLabel: ctx.CallInfo.FuncName, + ModuleLabel: ctx.CallInfo.ModuleName, + CallerLabel: callerLabel, + BuildTimeLabel: ctx.BuildInfo.BuildTime, + CommitLabel: ctx.BuildInfo.Commit, + VersionLabel: ctx.BuildInfo.Version, }).Inc() } diff --git a/pkg/autometrics/prometheus/prometheus.go b/pkg/autometrics/prometheus/prometheus.go index aae8ccc..f7514ff 100644 --- a/pkg/autometrics/prometheus/prometheus.go +++ b/pkg/autometrics/prometheus/prometheus.go @@ -9,6 +9,7 @@ var ( functionCallsCount *prometheus.CounterVec functionCallsDuration *prometheus.HistogramVec functionCallsConcurrent *prometheus.GaugeVec + buildInfo *prometheus.GaugeVec DefBuckets = autometrics.DefBuckets ) @@ -19,6 +20,8 @@ const ( FunctionCallsDurationName = "function_calls_duration" // FunctionCallsConcurrentName is the name of the prometheus metric for the number of simulateneously active calls to specific functions. FunctionCallsConcurrentName = "function_calls_concurrent" + // BuildInfoName is the name of the prometheus metric for the version of the monitored codebase. + BuildInfoName = "build_info" // FunctionLabel is the prometheus label that describes the function name. // @@ -47,10 +50,23 @@ const ( // In the case of success objectives, it describes the percentage of calls // that must be successful (i.e. have their [ResultLabel] be 'ok'). 
TargetSuccessRateLabel = "objective_percentile" - // SloLabelName is the prometheus label that describes the name of the Service Level Objective. + // SloNameLabel is the prometheus label that describes the name of the Service Level Objective. SloNameLabel = "objective_name" + + // CommitLabel is the prometheus label that describes the commit of the monitored codebase. + CommitLabel = "commit" + // VersionLabel is the prometheus label that describes the version of the monitored codebase. + VersionLabel = "version" + // BuildTimeLabel is the prometheus label that describes the timestamp of the build of the monitored codebase. + BuildTimeLabel = "build_time" ) +// BuildInfo holds meta information about the build of the instrumented code. +// +// This is a reexport of the autometrics type to allow [Init] to work with only +// the current (prometheus) package imported at the call site. +type BuildInfo = autometrics.BuildInfo + // Init sets up the metrics required for autometrics' decorated functions and registers // them to the argument registry. // @@ -60,29 +76,45 @@ const ( // Make sure that all the latency targets you want to use for SLOs are // present in the histogramBuckets array, otherwise the alerts will fail // to work (they will never trigger.) 
-func Init(reg *prometheus.Registry, histogramBuckets []float64) error { +func Init(reg *prometheus.Registry, histogramBuckets []float64, buildInformation BuildInfo) error { + autometrics.SetCommit(buildInformation.Commit) + autometrics.SetVersion(buildInformation.Version) + autometrics.SetBuildTime(buildInformation.BuildTime) + functionCallsCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: FunctionCallsCountName, - }, []string{FunctionLabel, ModuleLabel, CallerLabel, ResultLabel, TargetSuccessRateLabel, SloNameLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, ResultLabel, TargetSuccessRateLabel, SloNameLabel, CommitLabel, VersionLabel, BuildTimeLabel}) functionCallsDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: FunctionCallsDurationName, Buckets: histogramBuckets, - }, []string{FunctionLabel, ModuleLabel, CallerLabel, TargetLatencyLabel, TargetSuccessRateLabel, SloNameLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, TargetLatencyLabel, TargetSuccessRateLabel, SloNameLabel, CommitLabel, VersionLabel, BuildTimeLabel}) functionCallsConcurrent = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: FunctionCallsConcurrentName, - }, []string{FunctionLabel, ModuleLabel, CallerLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, CommitLabel, VersionLabel, BuildTimeLabel}) + + buildInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: BuildInfoName, + }, []string{CommitLabel, VersionLabel, BuildTimeLabel}) if reg != nil { reg.MustRegister(functionCallsCount) reg.MustRegister(functionCallsDuration) reg.MustRegister(functionCallsConcurrent) + reg.MustRegister(buildInfo) } else { prometheus.DefaultRegisterer.MustRegister(functionCallsCount) prometheus.DefaultRegisterer.MustRegister(functionCallsDuration) prometheus.DefaultRegisterer.MustRegister(functionCallsConcurrent) + prometheus.DefaultRegisterer.MustRegister(buildInfo) } + buildInfo.With(prometheus.Labels{ + CommitLabel: buildInformation.Commit, 
+ VersionLabel: buildInformation.Version, + BuildTimeLabel: buildInformation.BuildTime, + }).Set(1) + return nil } From aced1d6467f88f5c41a2a6a436f36cb048abf69e Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Fri, 21 Apr 2023 11:34:04 +0200 Subject: [PATCH 02/10] Fix tests --- internal/generate/generate_test.go | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/internal/generate/generate_test.go b/internal/generate/generate_test.go index 927950d..42be489 100644 --- a/internal/generate/generate_test.go +++ b/internal/generate/generate_test.go @@ -62,12 +62,12 @@ func main() { "//\n" + "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + - "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29%29&g0.tab=0\n" + - "// [Concurrent Calls]: 
http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22main%22%7D&g0.tab=0\n" + - "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0\n" + + "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"Service Test\" --success-target 99\n" + "func main() {\n" + @@ -148,12 +148,12 @@ func main() { "//\n" + "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + - "// 
[Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29%29%29&g0.tab=0\n" + - "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22main%22%7D&g0.tab=0\n" + - "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29%29&g0.tab=0\n" + - "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0\n" + + "// [Request Rate]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Request Rate Callee]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"API\" --latency-target 99.9 --latency-ms 500\n" + "func main() {\n" + @@ -626,8 +626,8 @@ func implementContextCodeGenTest(t *testing.T, contextToSerialize autometrics.Co sourceContext := internal.GeneratorContext{ RuntimeCtx: contextToSerialize, FuncCtx: internal.GeneratorFunctionContext{ - CommentIndex: -1, - ImplImportName: "autometrics", + CommentIndex: -1, + ImplImportName: "autometrics", }, } From 2670345b936c7adb47ba6764c105d3514ee10a00 Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Mon, 24 Apr 2023 17:50:20 +0200 Subject: [PATCH 03/10] Reorganize projects to use build info in metrics --- .gitmodules | 3 + README.md | 11 +- {examples/otel => configs}/alertmanager.yml | 0 configs/autometrics.rules.yml | 1594 ----------------- configs/shared | 1 + ...docker-compose.open-telemetry-example.yaml | 12 +- ... 
=> docker-compose.prometheus-example.yaml | 23 +- examples/otel/Dockerfile | 26 +- examples/otel/README.md | 5 + examples/otel/autometrics.rules.yml | 1594 ----------------- examples/otel/configs/alertmanager.yml | 1 + examples/otel/configs/autometrics.rules.yml | 1 + examples/otel/{ => configs}/prometheus.yaml | 0 examples/otel/load.Dockerfile | 8 + examples/otel/scripts/build_server | 11 + examples/{web => otel/scripts}/poll_server | 8 +- examples/web/Dockerfile | 26 +- examples/web/README.md | 33 +- examples/web/alertmanager.yml | 17 - examples/web/autometrics.rules.yml | 1 - examples/web/cmd/main.go | 34 +- examples/web/cmd/main.go.orig | 10 +- examples/web/configs/alertmanager.yml | 1 + examples/web/configs/autometrics.rules.yml | 1 + examples/web/{ => configs}/prometheus.yaml | 0 examples/web/load.Dockerfile | 8 + examples/web/scripts/build_server | 11 + examples/{otel => web/scripts}/poll_server | 8 +- internal/autometrics/doc.go | 3 +- internal/generate/generate_test.go | 8 +- 30 files changed, 182 insertions(+), 3277 deletions(-) create mode 100644 .gitmodules rename {examples/otel => configs}/alertmanager.yml (100%) delete mode 100644 configs/autometrics.rules.yml create mode 160000 configs/shared rename examples/web/docker-compose.yaml => docker-compose.open-telemetry-example.yaml (71%) rename examples/otel/docker-compose.yaml => docker-compose.prometheus-example.yaml (61%) delete mode 100644 examples/otel/autometrics.rules.yml create mode 120000 examples/otel/configs/alertmanager.yml create mode 120000 examples/otel/configs/autometrics.rules.yml rename examples/otel/{ => configs}/prometheus.yaml (100%) create mode 100644 examples/otel/load.Dockerfile create mode 100755 examples/otel/scripts/build_server rename examples/{web => otel/scripts}/poll_server (50%) delete mode 100644 examples/web/alertmanager.yml delete mode 120000 examples/web/autometrics.rules.yml create mode 120000 examples/web/configs/alertmanager.yml create mode 120000 
examples/web/configs/autometrics.rules.yml rename examples/web/{ => configs}/prometheus.yaml (100%) create mode 100644 examples/web/load.Dockerfile create mode 100755 examples/web/scripts/build_server rename examples/{otel => web/scripts}/poll_server (50%) diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..7ab9faf --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "autometrics-shared"] + path = configs/shared + url = https://github.com/autometrics-dev/autometrics-shared.git diff --git a/README.md b/README.md index 9b5f823..a006b06 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,16 @@ trigger alerts directly from production usage: ![a Slack bot is posting an alert directly in the channel](./assets/slack-alert-example.png) A fully working use-case and example of library usage is available in the -[examples/web](./examples/web) subdirectory +[examples/web](./examples/web) subdirectory. You can build and run load on the +example server using: + +```console +git submodule update --init +docker compose -f docker-compose.prometheus-example.yaml up +``` + +And then explore the generated links by opening the [main +file](./examples/web/cmd/main.go). ## How to use diff --git a/examples/otel/alertmanager.yml b/configs/alertmanager.yml similarity index 100% rename from examples/otel/alertmanager.yml rename to configs/alertmanager.yml diff --git a/configs/autometrics.rules.yml b/configs/autometrics.rules.yml deleted file mode 100644 index 57d3038..0000000 --- a/configs/autometrics.rules.yml +++ /dev/null @@ -1,1594 +0,0 @@ - ---- -# Code generated by Sloth (v0.11.0): https://github.com/slok/sloth. -# DO NOT EDIT. 
- -groups: -- name: sloth-slo-sli-recordings-autometrics-success-rate-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-90 - rules: - - record: slo:objective:ratio - expr: vector(0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", - sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-90 - rules: - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > 
(14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-95 - rules: - - record: slo:objective:ratio - expr: vector(0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", - sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-95 - rules: - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > 
(14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99 - rules: - - record: slo:objective:ratio - expr: vector(0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", - sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99 - rules: - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > 
(14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[6h]))) - / - (sum by (objective_name, 
objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - 
sloth_slo: success-rate-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99_9", - sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99_9 - rules: - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) 
without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="90"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-90 - rules: - - record: slo:objective:ratio - expr: 
vector(0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", - sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: latency-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-90 - rules: - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", 
sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="95"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-95 - rules: - - record: slo:objective:ratio - expr: 
vector(0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", - sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: latency-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-95 - rules: - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", 
sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="99"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-99 - rules: - - record: slo:objective:ratio - expr: 
vector(0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", - sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: latency-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-99 - rules: - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", 
sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by 
(objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30d -- name: 
sloth-slo-meta-recordings-autometrics-latency-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", - sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: 
sloth-slo-alerts-autometrics-latency-99_9 - rules: - - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
diff --git a/configs/shared b/configs/shared new file mode 160000 index 0000000..fd4aa1e --- /dev/null +++ b/configs/shared @@ -0,0 +1 @@ +Subproject commit fd4aa1e7fa3aaa7a736f778ee782e522df73b336 diff --git a/examples/web/docker-compose.yaml b/docker-compose.open-telemetry-example.yaml similarity index 71% rename from examples/web/docker-compose.yaml rename to docker-compose.open-telemetry-example.yaml index 335e10d..5582f85 100644 --- a/examples/web/docker-compose.yaml +++ b/docker-compose.open-telemetry-example.yaml @@ -20,8 +20,8 @@ services: container_name: alertmanager restart: unless-stopped volumes: - - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml - - ./slack_url.txt:/etc/alertmanager/slack_url + - ./examples/otel/configs/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - ./examples/otel/configs/slack_url.txt:/etc/alertmanager/slack_url command: - '--config.file=/etc/alertmanager/alertmanager.yml' expose: @@ -34,8 +34,8 @@ services: container_name: prometheus restart: unless-stopped volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yaml - - ./autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml + - ./examples/otel/configs/prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./examples/otel/configs/autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml command: - '--config.file=/etc/prometheus/prometheus.yaml' expose: @@ -48,7 +48,9 @@ services: - web-server web-server: - build: . + build: + context: . 
+ dockerfile: examples/otel/Dockerfile container_name: web-server restart: unless-stopped expose: diff --git a/examples/otel/docker-compose.yaml b/docker-compose.prometheus-example.yaml similarity index 61% rename from examples/otel/docker-compose.yaml rename to docker-compose.prometheus-example.yaml index 335e10d..aaa805d 100644 --- a/examples/otel/docker-compose.yaml +++ b/docker-compose.prometheus-example.yaml @@ -20,8 +20,8 @@ services: container_name: alertmanager restart: unless-stopped volumes: - - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml - - ./slack_url.txt:/etc/alertmanager/slack_url + - ./examples/web/configs/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - ./examples/web/configs/slack_url.txt:/etc/alertmanager/slack_url command: - '--config.file=/etc/alertmanager/alertmanager.yml' expose: @@ -34,8 +34,8 @@ services: container_name: prometheus restart: unless-stopped volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yaml - - ./autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml + - ./examples/web/configs/prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./examples/web/configs/autometrics.rules.yml:/etc/prometheus/autometrics.rules.yml command: - '--config.file=/etc/prometheus/prometheus.yaml' expose: @@ -48,10 +48,23 @@ services: - web-server web-server: - build: . + build: + context: . + dockerfile: examples/web/Dockerfile container_name: web-server restart: unless-stopped expose: - 62086 ports: - "62086:62086" + + load-server: + build: + context: . 
+ dockerfile: examples/web/load.Dockerfile + environment: + TARGET_HOST: web-server + container_name: load-server + restart: unless-stopped + depends_on: + - web-server diff --git a/examples/otel/Dockerfile b/examples/otel/Dockerfile index 782f7a9..1a87b34 100644 --- a/examples/otel/Dockerfile +++ b/examples/otel/Dockerfile @@ -1,16 +1,24 @@ FROM golang:1.20-alpine MAINTAINER Fiberplane +ARG version=development -# Cannot really build the demo image from -# the examples subfolder because of -# relative imports shenanigans that go out of build context (i.e. upwards) -# -# Use -# GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -# -# To build the web-server app +RUN apk update && apk add git -COPY web-server / +WORKDIR /app + +COPY . ./ + +RUN go mod download + +WORKDIR /app/examples/web + +RUN go generate cmd/main.go + +ENV VERSION="$version" + +RUN scripts/build_server + +RUN cp web-server / EXPOSE 62086 diff --git a/examples/otel/README.md b/examples/otel/README.md index 3e29afd..8cb369f 100644 --- a/examples/otel/README.md +++ b/examples/otel/README.md @@ -11,3 +11,8 @@ You can notice the 3 differences that are mentionned in the top-level README: - The autometrics call in the Go generator has the `-otel` flag - The `amImpl.Init` call uses a different first argument, with the name of the OpenTelemetry scope to use + +## Quickstart + +You can build and run the example by using the +`docker-compose.open-telemetry-example.yaml` file at the root of the repo. diff --git a/examples/otel/autometrics.rules.yml b/examples/otel/autometrics.rules.yml deleted file mode 100644 index 57d3038..0000000 --- a/examples/otel/autometrics.rules.yml +++ /dev/null @@ -1,1594 +0,0 @@ - ---- -# Code generated by Sloth (v0.11.0): https://github.com/slok/sloth. -# DO NOT EDIT. 
- -groups: -- name: sloth-slo-sli-recordings-autometrics-success-rate-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"}[30d]) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-90 - rules: - - record: slo:objective:ratio - expr: vector(0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", - sloth_slo="success-rate-90"} - labels: - sloth_id: autometrics-success-rate-90 - sloth_service: autometrics - sloth_slo: success-rate-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: success-rate-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-90 - rules: - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > 
(14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-90", sloth_service="autometrics", sloth_slo="success-rate-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"}[30d]) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-95 - rules: - - record: slo:objective:ratio - expr: vector(0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", - sloth_slo="success-rate-95"} - labels: - sloth_id: autometrics-success-rate-95 - sloth_service: autometrics - sloth_slo: success-rate-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: success-rate-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-95 - rules: - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > 
(14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-95", sloth_service="autometrics", sloth_slo="success-rate-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[6h]))) - / - (sum by (objective_name, objective_percentile) 
(rate(function_calls_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"}[30d]) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99 - rules: - - record: slo:objective:ratio - expr: vector(0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:time_period:days - expr: vector(30) - labels: - 
sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", - sloth_slo="success-rate-99"} - labels: - sloth_id: autometrics-success-rate-99 - sloth_service: autometrics - sloth_slo: success-rate-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: success-rate-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99 - rules: - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > 
(14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99", sloth_service="autometrics", sloth_slo="success-rate-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[5m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[30m]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[2h]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[6h]))) - / - (sum by (objective_name, 
objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[1d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9",result="error"}[3d]))) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"}[30d]) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-success-rate-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - 
sloth_slo: success-rate-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-success-rate-99_9", - sloth_service="autometrics", sloth_slo="success-rate-99_9"} - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_service: autometrics - sloth_slo: success-rate-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-success-rate-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: success-rate-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-success-rate-99_9 - rules: - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) 
without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: page - sloth_severity: page - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Error Rate SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-success-rate-99_9", sloth_service="autometrics", sloth_slo="success-rate-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: success-rate - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High error rate on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-90 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="90"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="90"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="90"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"}[30d]) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-90 - rules: - - record: slo:objective:ratio - expr: 
vector(0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:error_budget:ratio - expr: vector(1-0.9) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-90", sloth_service="autometrics", - sloth_slo="latency-90"} - labels: - sloth_id: autometrics-latency-90 - sloth_service: autometrics - sloth_slo: latency-90 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-90 - sloth_mode: cli-gen-prom - sloth_objective: "90" - sloth_service: autometrics - sloth_slo: latency-90 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-90 - rules: - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-90", 
sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (14.4 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (6 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 90% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (3 * 0.1)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-90", sloth_service="autometrics", sloth_slo="latency-90"} > (1 * 0.1)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-95 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="95"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="95"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="95"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"}[30d]) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-95 - rules: - - record: slo:objective:ratio - expr: 
vector(0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:error_budget:ratio - expr: vector(1-0.95) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-95", sloth_service="autometrics", - sloth_slo="latency-95"} - labels: - sloth_id: autometrics-latency-95 - sloth_service: autometrics - sloth_slo: latency-95 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-95 - sloth_mode: cli-gen-prom - sloth_objective: "95" - sloth_service: autometrics - sloth_slo: latency-95 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-95 - rules: - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-95", 
sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (14.4 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (6 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 95% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (3 * 0.05)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-95", sloth_service="autometrics", sloth_slo="latency-95"} > (1 * 0.05)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by (objective_name, objective_percentile) 
(rate(function_calls_duration_count{objective_percentile="99"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"}[30d]) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - sloth_window: 30d -- name: sloth-slo-meta-recordings-autometrics-latency-99 - rules: - - record: slo:objective:ratio - expr: 
vector(0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:error_budget:ratio - expr: vector(1-0.99) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99", sloth_service="autometrics", - sloth_slo="latency-99"} - labels: - sloth_id: autometrics-latency-99 - sloth_service: autometrics - sloth_slo: latency-99 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99 - sloth_mode: cli-gen-prom - sloth_objective: "99" - sloth_service: autometrics - sloth_slo: latency-99 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: sloth-slo-alerts-autometrics-latency-99 - rules: - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99", 
sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (14.4 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (6 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. - - alert: High Latency SLO - 99% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (3 * 0.01)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99", sloth_service="autometrics", sloth_slo="latency-99"} > (1 * 0.01)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
-- name: sloth-slo-sli-recordings-autometrics-latency-99_9 - rules: - - record: slo:sli_error:ratio_rate5m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[5m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[5m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 5m - - record: slo:sli_error:ratio_rate30m - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[30m]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[30m])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30m - - record: slo:sli_error:ratio_rate1h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - 
label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1h - - record: slo:sli_error:ratio_rate2h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[2h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[2h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 2h - - record: slo:sli_error:ratio_rate6h - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[6h]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[6h])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 6h - - record: slo:sli_error:ratio_rate1d - expr: | - (sum by 
(objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[1d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[1d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 1d - - record: slo:sli_error:ratio_rate3d - expr: | - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) - (sum by (objective_name, objective_percentile) ( - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "objective_latency_threshold") - and - label_join(rate(function_calls_duration_bucket{objective_percentile="99.9"}[3d]), "autometrics_check_label_equality", "", "le") - )) - ) - / - (sum by (objective_name, objective_percentile) (rate(function_calls_duration_count{objective_percentile="99.9"}[3d])) > 0) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 3d - - record: slo:sli_error:ratio_rate30d - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"}[30d]) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_window: 30d -- name: 
sloth-slo-meta-recordings-autometrics-latency-99_9 - rules: - - record: slo:objective:ratio - expr: vector(0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:error_budget:ratio - expr: vector(1-0.9990000000000001) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:time_period:days - expr: vector(30) - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:current_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_burn_rate:ratio - expr: | - slo:sli_error:ratio_rate30d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: slo:period_error_budget_remaining:ratio - expr: 1 - slo:period_burn_rate:ratio{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", - sloth_slo="latency-99_9"} - labels: - sloth_id: autometrics-latency-99_9 - sloth_service: autometrics - sloth_slo: latency-99_9 - - record: sloth_slo_info - expr: vector(1) - labels: - sloth_id: autometrics-latency-99_9 - sloth_mode: cli-gen-prom - sloth_objective: "99.9" - sloth_service: autometrics - sloth_slo: latency-99_9 - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 -- name: 
sloth-slo-alerts-autometrics-latency-99_9 - rules: - - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: page - sloth_severity: page - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
- - alert: High Latency SLO - 99.9% - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="autometrics-latency-99_9", sloth_service="autometrics", sloth_slo="latency-99_9"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - category: latency - severity: ticket - sloth_severity: ticket - annotations: - summary: 'High latency on SLO: {{$labels.objective_name}}' - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error budget - burn rate is too fast. 
diff --git a/examples/otel/configs/alertmanager.yml b/examples/otel/configs/alertmanager.yml new file mode 120000 index 0000000..2b1bf0c --- /dev/null +++ b/examples/otel/configs/alertmanager.yml @@ -0,0 +1 @@ +../../../configs/alertmanager.yml \ No newline at end of file diff --git a/examples/otel/configs/autometrics.rules.yml b/examples/otel/configs/autometrics.rules.yml new file mode 120000 index 0000000..2048540 --- /dev/null +++ b/examples/otel/configs/autometrics.rules.yml @@ -0,0 +1 @@ +../../../configs/shared/autometrics.rules.yml \ No newline at end of file diff --git a/examples/otel/prometheus.yaml b/examples/otel/configs/prometheus.yaml similarity index 100% rename from examples/otel/prometheus.yaml rename to examples/otel/configs/prometheus.yaml diff --git a/examples/otel/load.Dockerfile b/examples/otel/load.Dockerfile new file mode 100644 index 0000000..b20451d --- /dev/null +++ b/examples/otel/load.Dockerfile @@ -0,0 +1,8 @@ +FROM golang:1.20-alpine +MAINTAINER Fiberplane + +RUN apk update && apk add curl + +COPY examples/web/scripts/poll_server / + +CMD [ "/poll_server" ] diff --git a/examples/otel/scripts/build_server b/examples/otel/scripts/build_server new file mode 100755 index 0000000..865bd73 --- /dev/null +++ b/examples/otel/scripts/build_server @@ -0,0 +1,11 @@ +#!/bin/sh + +set -euo pipefail + +VERSION=${VERSION:-development} +COMMIT=`git log -1 --format="%H"` +BUILD_TIME=`date -Iseconds` +SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" + + +go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.BuildTime=${BUILD_TIME}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go diff --git a/examples/web/poll_server b/examples/otel/scripts/poll_server similarity index 50% rename from examples/web/poll_server rename to examples/otel/scripts/poll_server index 2632d28..e395463 100755 --- a/examples/web/poll_server +++ b/examples/otel/scripts/poll_server @@ -2,13 +2,15 @@ set -euo pipefail 
+TARGET_HOST="${TARGET_HOST:-localhost}" + while true do if [ "$(($RANDOM % 2))" == "0" ]; then - curl http://localhost:62086/random-error + curl "http://${TARGET_HOST}:62086/random-error" fi if [ "$(($RANDOM % 4))" == "0" ]; then - curl http://localhost:62086/ + curl "http://${TARGET_HOST}:62086/" fi - sleep 1 + sleep 0.2 done diff --git a/examples/web/Dockerfile b/examples/web/Dockerfile index 782f7a9..1a87b34 100644 --- a/examples/web/Dockerfile +++ b/examples/web/Dockerfile @@ -1,16 +1,24 @@ FROM golang:1.20-alpine MAINTAINER Fiberplane +ARG version=development -# Cannot really build the demo image from -# the examples subfolder because of -# relative imports shenanigans that go out of build context (i.e. upwards) -# -# Use -# GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -# -# To build the web-server app +RUN apk update && apk add git -COPY web-server / +WORKDIR /app + +COPY . ./ + +RUN go mod download + +WORKDIR /app/examples/web + +RUN go generate cmd/main.go + +ENV VERSION="$version" + +RUN scripts/build_server + +RUN cp web-server / EXPOSE 62086 diff --git a/examples/web/README.md b/examples/web/README.md index c52d733..9cdb9ca 100644 --- a/examples/web/README.md +++ b/examples/web/README.md @@ -8,9 +8,12 @@ It shows the generator usage and sets up Prometheus to showcase the ## Quick start ``` sh -GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -docker compose up -d -./poll_server +# Go to the root of the repo +cd ../.. +# Build all the images +docker compose -f docker-compose.prometheus-example.yaml build +# Run all the services +docker compose -f docker-compose.prometheus-example.yaml up ``` Then open [main](./cmd/main.go) in your editor and interact with the documentation links! @@ -21,7 +24,8 @@ Then open [main](./cmd/main.go) in your editor and interact with the documentati Optionnally, create a slack integration with an "incoming webhook" for one of your channels, and put the URL of the webhook (a secret!) 
in `slack_url.txt` in -the directory. That will enable alerting in Slack directly through Alertmanager. +the [configs](./configs) directory. That will enable alerting in Slack directly +through Alertmanager. You can see that the name of the service "API" comes directly from the annotation in the code. @@ -36,7 +40,6 @@ You can even monitor all the alerts triggering through Prometheus or Alertmanage In order to run this example you need: -- Go (at least 1.18) - Docker - Docker Compose @@ -66,20 +69,24 @@ The generator is idempotent. ### Building the docker image -Build the web-server for the image architecture: +Build the web-server in an image. There are 2 important things in the +image [recipe](./Dockerfile): +- The context of the image is the root of the repository, only so that this + example runs the `development` version of the code, and +- There is a specific [build script](./scripts/build_server) that uses + Go linker flags to inject build and version information in the binary. ```sh -GOOS=linux GOARCH=amd64 go build -o web-server ./cmd/main.go -docker compose build +docker build --build-arg version=1.0.0 -t web-server . ``` ### Start the services -In one terminal you can launch the stack and the small helper script to poll the the server: +In one terminal you can launch the image and the small helper script to poll the server: ```sh -docker compose up -d -./poll_server +cd ../.. +docker compose -f docker-compose.prometheus-example.yaml up ``` ### Check the links on Prometheus @@ -104,9 +111,9 @@ configuration to the correct notification service: ![Alertmanager alerts dashboard showing the alerts firing](../../assets/alertmanager-alert-example.png) -This demo example has a [minimal configuration](./alertmanager.yml) for alerts +This demo example has a [minimal configuration](./configs/alertmanager.yml) for alerts that expects a file `slack_url.txt` to be passed in docker-compose context. 
-Create the file in the same folder as this README, and if the file exists, the +Create the file in the [configs folder](./configs), and if the file exists, the triggered alerts automatically go on Slack to the configured channel: ![a Slack bot is posting an alert directly in the channel](../../assets/slack-alert-example.png) diff --git a/examples/web/alertmanager.yml b/examples/web/alertmanager.yml deleted file mode 100644 index 92d01ab..0000000 --- a/examples/web/alertmanager.yml +++ /dev/null @@ -1,17 +0,0 @@ -global: - # Also possible to use the URL directly - # Ex: `slack_api_url: 'https://slack.com/...'` - slack_api_url_file: '/etc/alertmanager/slack_url' - -route: - receiver: 'slack-notifications' - group_by: [sloth_service, sloth_slo, objective_name] - -receivers: -- name: 'slack-notifications' - slack_configs: - # Channel is ignored when using a webhook. The webhook URL encodes the - # channel the alerts will be posted to. - - channel: '#alerts' - title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" - text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}" diff --git a/examples/web/autometrics.rules.yml b/examples/web/autometrics.rules.yml deleted file mode 120000 index 9f80a00..0000000 --- a/examples/web/autometrics.rules.yml +++ /dev/null @@ -1 +0,0 @@ -../../configs/autometrics.rules.yml \ No newline at end of file diff --git a/examples/web/cmd/main.go b/examples/web/cmd/main.go index e9260ca..7a5a7c3 100644 --- a/examples/web/cmd/main.go +++ b/examples/web/cmd/main.go @@ -16,6 +16,10 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go +var Version = "development" +var Commit = "n/a" +var BuildTime string + func main() { rand.Seed(time.Now().UnixNano()) @@ -26,9 +30,9 @@ func main() { nil, amImpl.DefBuckets, amImpl.BuildInfo{ - Version: "0.4.0", - Commit: "anySHA", - BuildTime: "", + Version: Version, + Commit: Commit, + BuildTime: BuildTime, }, ) @@ -62,12 +66,12 @@ func main() { // // 
autometrics:doc-end Generated documentation by Autometrics. // -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 // //autometrics:doc --slo "API" --latency-target 99 --latency-ms 250 func indexHandler(w http.ResponseWriter, _ *http.Request) error { @@ -108,12 +112,12 @@ var handlerError = errors.New("failed to handle request") // // autometrics:doc-end Generated documentation by Autometrics. // -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Error Ratio]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Request Rate Callee]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 +// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 // //autometrics:doc --slo "API" --success-target 90 func randomErrorHandler(w http.ResponseWriter, _ *http.Request) (err error) { diff --git a/examples/web/cmd/main.go.orig b/examples/web/cmd/main.go.orig index 2fe7a85..86057f7 100644 --- a/examples/web/cmd/main.go.orig +++ b/examples/web/cmd/main.go.orig @@ -16,6 +16,10 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go +var Version = "development" +var Commit = "n/a" +var BuildTime string + func main() { rand.Seed(time.Now().UnixNano()) @@ -26,9 +30,9 @@ func main() { nil, amImpl.DefBuckets, amImpl.BuildInfo{ - Version: "0.4.0", - Commit: "anySHA", - BuildTime: "", + Version: Version, + Commit: Commit, + BuildTime: BuildTime, }, ) diff --git a/examples/web/configs/alertmanager.yml b/examples/web/configs/alertmanager.yml new file mode 120000 index 0000000..2b1bf0c --- /dev/null +++ b/examples/web/configs/alertmanager.yml @@ -0,0 +1 @@ 
+../../../configs/alertmanager.yml \ No newline at end of file diff --git a/examples/web/configs/autometrics.rules.yml b/examples/web/configs/autometrics.rules.yml new file mode 120000 index 0000000..2048540 --- /dev/null +++ b/examples/web/configs/autometrics.rules.yml @@ -0,0 +1 @@ +../../../configs/shared/autometrics.rules.yml \ No newline at end of file diff --git a/examples/web/prometheus.yaml b/examples/web/configs/prometheus.yaml similarity index 100% rename from examples/web/prometheus.yaml rename to examples/web/configs/prometheus.yaml diff --git a/examples/web/load.Dockerfile b/examples/web/load.Dockerfile new file mode 100644 index 0000000..b20451d --- /dev/null +++ b/examples/web/load.Dockerfile @@ -0,0 +1,8 @@ +FROM golang:1.20-alpine +MAINTAINER Fiberplane + +RUN apk update && apk add curl + +COPY examples/web/scripts/poll_server / + +CMD [ "/poll_server" ] diff --git a/examples/web/scripts/build_server b/examples/web/scripts/build_server new file mode 100755 index 0000000..865bd73 --- /dev/null +++ b/examples/web/scripts/build_server @@ -0,0 +1,11 @@ +#!/bin/sh + +set -euo pipefail + +VERSION=${VERSION:-development} +COMMIT=`git log -1 --format="%H"` +BUILD_TIME=`date -Iseconds` +SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" + + +go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.BuildTime=${BUILD_TIME}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go diff --git a/examples/otel/poll_server b/examples/web/scripts/poll_server similarity index 50% rename from examples/otel/poll_server rename to examples/web/scripts/poll_server index 2632d28..e395463 100755 --- a/examples/otel/poll_server +++ b/examples/web/scripts/poll_server @@ -2,13 +2,15 @@ set -euo pipefail +TARGET_HOST="${TARGET_HOST:-localhost}" + while true do if [ "$(($RANDOM % 2))" == "0" ]; then - curl http://localhost:62086/random-error + curl "http://${TARGET_HOST}:62086/random-error" fi if [ "$(($RANDOM % 4))" == "0" ]; then - curl 
http://localhost:62086/ + curl "http://${TARGET_HOST}:62086/" fi - sleep 1 + sleep 0.2 done diff --git a/internal/autometrics/doc.go b/internal/autometrics/doc.go index 6efa99c..7c963e5 100644 --- a/internal/autometrics/doc.go +++ b/internal/autometrics/doc.go @@ -68,7 +68,7 @@ func requestRateQuery(counterName, labelKey, labelValue string) string { } func errorRatioQuery(counterName, labelKey, labelValue string) string { - return fmt.Sprintf("sum by (%s, %s, %s, %s) (rate(%s{%s=\"%s\",%s=\"error\"}[5m]) %s)", + return fmt.Sprintf("(sum by (%s, %s, %s, %s) (rate(%s{%s=\"%s\",%s=\"error\"}[5m]) %s)) / (%s)", prometheus.FunctionLabel, prometheus.ModuleLabel, prometheus.VersionLabel, @@ -78,6 +78,7 @@ func errorRatioQuery(counterName, labelKey, labelValue string) string { labelValue, prometheus.ResultLabel, addBuildInfoLabels(), + requestRateQuery(counterName, labelKey, labelValue), ) } diff --git a/internal/generate/generate_test.go b/internal/generate/generate_test.go index 42be489..bf3bbca 100644 --- a/internal/generate/generate_test.go +++ b/internal/generate/generate_test.go @@ -63,11 +63,11 @@ func main() { "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio 
Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"Service Test\" --success-target 99\n" + "func main() {\n" + @@ -149,11 +149,11 @@ func main() { "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + "// 
[Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + + "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"API\" --latency-target 99.9 --latency-ms 500\n" + "func main() {\n" + From 4711792a6115bdcbd285672ee546308baf14e5f6 Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Tue, 25 Apr 2023 16:34:34 +0200 Subject: [PATCH 04/10] Go back on autogenerated rules file --- configs/shared | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/shared b/configs/shared index fd4aa1e..09d5384 160000 --- a/configs/shared +++ b/configs/shared @@ -1 +1 @@ -Subproject commit 
fd4aa1e7fa3aaa7a736f778ee782e522df73b336 +Subproject commit 09d538449cef6a9af35900b4d91213f4f681d566 From 19ecd34cde43c6d79dd09545149b3170b1604d04 Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Wed, 26 Apr 2023 08:51:35 +0200 Subject: [PATCH 05/10] Add -help and -version flags to the binary Closes: #36 --- .github/workflows/release.yml | 2 +- cmd/autometrics/main.go | 70 +++++++++++++++++++ .../{doc.go => prometheus_link_gen.go} | 0 internal/build/build.go | 7 ++ scripts/build_generator | 10 +++ 5 files changed, 88 insertions(+), 1 deletion(-) rename internal/autometrics/{doc.go => prometheus_link_gen.go} (100%) create mode 100644 internal/build/build.go create mode 100755 scripts/build_generator diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a8267c2..1854d67 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -37,7 +37,7 @@ jobs: check-latest: true - name: Build run: | - GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} go build cmd/autometrics/main.go + GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} ./scripts/build_generator mv main${{ matrix.ext }} autometrics${{ matrix.ext }} - name: Pack (Zip) diff --git a/cmd/autometrics/main.go b/cmd/autometrics/main.go index 75c6af8..2d76b8a 100644 --- a/cmd/autometrics/main.go +++ b/cmd/autometrics/main.go @@ -1,10 +1,12 @@ package main import ( + "fmt" "log" "os" internal "github.com/autometrics-dev/autometrics-go/internal/autometrics" + "github.com/autometrics-dev/autometrics-go/internal/build" "github.com/autometrics-dev/autometrics-go/internal/generate" "github.com/autometrics-dev/autometrics-go/pkg/autometrics" ) @@ -13,6 +15,10 @@ const ( prometheusAddressEnvironmentVariable = "AM_PROMETHEUS_URL" useOtelFlag = "-otel" allowCustomLatencies = "-custom-latency" + shortVersionFlag = "-v" + longVersionFlag = "-version" + shortHelpFlag = "-h" + longHelpFlag = "-help" DefaultPrometheusInstanceUrl = "http://localhost:9090/" ) @@ -21,6 +27,21 @@ func 
main() { moduleName := os.Getenv("GOPACKAGE") args := os.Args + if contains(args, longVersionFlag) || contains(args, shortVersionFlag) { + printVersion() + os.Exit(0) + } + + if contains(args, longHelpFlag) || contains(args, shortHelpFlag) { + printHelp() + os.Exit(0) + } + + if fileName == "" { + printHelp() + os.Exit(1) + } + prometheusUrl, envVarExists := os.LookupEnv(prometheusAddressEnvironmentVariable) if !envVarExists { prometheusUrl = DefaultPrometheusInstanceUrl @@ -49,3 +70,52 @@ func contains[T comparable](s []T, e T) bool { } return false } + +func printVersion() { + fmt.Printf("%s\n", build.Version) + if build.Time != "" { + fmt.Printf("Built on %s\n", build.Time) + } +} + +func printHelp() { + fmt.Printf("Autometrics %s", build.Version) + if build.Time != "" { + fmt.Printf(" (%s)", build.Time) + } + fmt.Printf("\nBuilt by Autometrics team -- https://autometrics.dev\n\n") + + fmt.Printf( + "usage: %s [%s | %s] [%s | %s] [%s] [%s] \n\n", + os.Args[0], + shortVersionFlag, + longVersionFlag, + shortHelpFlag, + longHelpFlag, + useOtelFlag, + allowCustomLatencies, + ) + + fmt.Println("Autometrics is meant to be used in a Go generator context. As such, it takes mandatory arguments in the form of environment variables:") + fmt.Println(" GOFILE\tPath to the file to transform.") + fmt.Println(" GOPACKAGE\tName to the containing package.") + fmt.Printf("\n\n") + + fmt.Println("Autometrics generates links to a prometheus instance in the doc comments of instrumented functions. 
You can control the base URL of the pointed to instance with an environment variable:") + fmt.Printf(" %s\tBase URL of the Prometheus instance to generate links to (default: %s)\n", + prometheusAddressEnvironmentVariable, + DefaultPrometheusInstanceUrl, + ) + fmt.Printf("\n\n") + + fmt.Printf(" %s\tUse OpenTelemetry client library to instrument code instead of default Prometheus\n", + useOtelFlag, + ) + fmt.Printf(" %s\tAllow non-default latencies to be used in latency-based SLOs (the default values in seconds are %v)\n", + allowCustomLatencies, + autometrics.DefBuckets, + ) + fmt.Printf("\n") + + fmt.Println("Check https://github.com/autometrics-dev/autometrics-go for more help (including examples) and usage information.") +} diff --git a/internal/autometrics/doc.go b/internal/autometrics/prometheus_link_gen.go similarity index 100% rename from internal/autometrics/doc.go rename to internal/autometrics/prometheus_link_gen.go diff --git a/internal/build/build.go b/internal/build/build.go new file mode 100644 index 0000000..3e3cd57 --- /dev/null +++ b/internal/build/build.go @@ -0,0 +1,7 @@ +package build // import "github.com/autometrics-dev/autometrics-go/internal/build" + +// Time is the timestamp of the build, when made available through ldflags. +var Time string + +// Version is the version string of the build, when made available through ldflags. 
+var Version = "development" diff --git a/scripts/build_generator b/scripts/build_generator new file mode 100755 index 0000000..e6dceaf --- /dev/null +++ b/scripts/build_generator @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +set -euo pipefail + +BUILD_PACK="github.com/autometrics-dev/autometrics-go/internal/build" +VERSION=`git describe --tags` +BUILD_TIME=`date -u` +SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" + +go build -v -ldflags="-X '${BUILD_PACK}.Version=${VERSION}' -X '${BUILD_PACK}.Time=${BUILD_TIME}'" ${SCRIPT_DIR}/../cmd/autometrics/main.go From 323548fbff52b4cf8fa7cdfd5600dd5add2debfe Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Wed, 26 Apr 2023 09:06:57 +0200 Subject: [PATCH 06/10] Fetch all tags in CI for binary builds --- .github/workflows/release.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1854d67..04276e4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -30,6 +30,9 @@ jobs: ext: '' steps: - uses: actions/checkout@v3 + with: + # We need all tags + fetch-depth: 0 - name: Set up Go uses: actions/setup-go@v4 with: From 17e105cc83a70d2fe7bd73ab9d08ae9fef57a01e Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Fri, 28 Apr 2023 12:36:01 +0200 Subject: [PATCH 07/10] !Breaking: Use go-arg to parse generator arguments Breaking change: the long flags of the autometrics generator now use the long form starting with `--` (for example `--otel` instead of `-otel`). Also removes the unused `am-alertsgen` binary. Even though for the time being we are using sloth to generate the alerts, at some point we might want to change, and adding extra code/friction to do the change seems unwarranted for now.
--- README.md | 8 +-- cmd/am-alertsgen/main.go | 81 ----------------------- cmd/autometrics/main.go | 136 +++++++++++++------------------------- examples/otel/README.md | 2 +- examples/otel/cmd/main.go | 2 +- go.mod | 3 +- go.sum | 7 +- 7 files changed, 59 insertions(+), 180 deletions(-) delete mode 100644 cmd/am-alertsgen/main.go diff --git a/README.md b/README.md index a006b06..fb7b403 100644 --- a/README.md +++ b/README.md @@ -79,11 +79,11 @@ have the `--latency-ms` values to match the values given in your buckets. The values in the buckets are given in _seconds_. By default, the generator will error and tell you the valid default values if they don't match. If the default values do not match your use case, you can change the buckets in -the init call, and add a `-custom-latency` argument to the `//go:generate` invocation. +the init call, and add a `--custom-latency` argument to the `//go:generate` invocation. ```patch -//go:generate autometrics -+//go:generate autometrics -custom-latency ++//go:generate autometrics --custom-latency ``` ### Add cookies in your code @@ -228,11 +228,11 @@ metric. 
You can use the name of the application or its version for example ) ``` -- add the `-otel` flag to the `//go:generate` directive +- add the `--otel` flag to the `//go:generate` directive ```patch -//go:generate autometrics -+//go:generate autometrics -otel ++//go:generate autometrics --otel ``` ## (OPTIONAL) Git hook diff --git a/cmd/am-alertsgen/main.go b/cmd/am-alertsgen/main.go deleted file mode 100644 index e6da4cb..0000000 --- a/cmd/am-alertsgen/main.go +++ /dev/null @@ -1,81 +0,0 @@ -package main - -import ( - "log" - - _ "github.com/slok/sloth/pkg/prometheus/api/v1" -) - -func main() { - // TODO Replicate these rules from autometrics-rs/autometrics-cli/src/sloth.rs - // - // With default values - // #[clap(long, default_values = &["90", "95", "99", "99.9"])] - // objectives: Vec, - /* - fn generate_success_rate_slo(objective: &Decimal) -> String { - let objective_fraction = (objective / Decimal::from(100)).normalize(); - let objective_no_decimal = objective.to_string().replace(".", ""); - - format!(" - name: success-rate-{objective_no_decimal} - objective: {objective} - description: Common SLO based on function success rates - sli: - events: - error_query: sum by (slo_name, objective) (rate(function_calls_count{{objective=\"{objective_fraction}\",result=\"error\"}}[{{{{.window}}}}])) - total_query: sum by (slo_name, objective) (rate(function_calls_count{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}])) - alerting: - name: High Error Rate SLO - {objective}% - labels: - category: success-rate - annotations: - summary: \"High error rate on SLO: {{{{$labels.slo_name}}}}\" - page_alert: - labels: - severity: page - ticket_alert: - labels: - severity: ticket -") -} - -fn generate_latency_slo(objective: &Decimal) -> String { - let objective_fraction = (objective / Decimal::from(100)).normalize(); - let objective_no_decimal = objective.to_string().replace(".", ""); - - format!(" - name: latency-{objective_no_decimal} - objective: {objective} - description: 
Common SLO based on function latency - sli: - events: - error_query: > - sum by (slo_name, objective) (rate(function_calls_duration_bucket{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}])) - - - (sum by (slo_name, objective) ( - label_join(rate(function_calls_duration_bucket{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}]), \"autometrics_check_label_equality\", \"\", \"target_latency\") - and - label_join(rate(function_calls_duration_bucket{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}]), \"autometrics_check_label_equality\", \"\", \"le\") - )) - total_query: sum by (slo_name, objective) (rate(function_calls_duration_bucket{{objective=\"{objective_fraction}\"}}[{{{{.window}}}}])) - alerting: - name: High Latency SLO - {objective}% - labels: - category: latency - annotations: - summary: \"High latency on SLO: {{{{$labels.slo_name}}}}\" - page_alert: - labels: - severity: page - ticket_alert: - labels: - severity: ticket -") -} - */ - - - // TODO: Once the sloth rules have been made, we should be able to call - // the "binary" part of the sloth dep to generate the prom rules directly. 
- - log.Fatalf("unimplemented") -} diff --git a/cmd/autometrics/main.go b/cmd/autometrics/main.go index 2d76b8a..9820db6 100644 --- a/cmd/autometrics/main.go +++ b/cmd/autometrics/main.go @@ -3,119 +3,75 @@ package main import ( "fmt" "log" - "os" + "strings" internal "github.com/autometrics-dev/autometrics-go/internal/autometrics" "github.com/autometrics-dev/autometrics-go/internal/build" "github.com/autometrics-dev/autometrics-go/internal/generate" "github.com/autometrics-dev/autometrics-go/pkg/autometrics" + + arg "github.com/alexflint/go-arg" ) const ( - prometheusAddressEnvironmentVariable = "AM_PROMETHEUS_URL" - useOtelFlag = "-otel" - allowCustomLatencies = "-custom-latency" - shortVersionFlag = "-v" - longVersionFlag = "-version" - shortHelpFlag = "-h" - longHelpFlag = "-help" - DefaultPrometheusInstanceUrl = "http://localhost:9090/" + DefaultPrometheusInstanceUrl = "http://localhost:9090/" ) -func main() { - fileName := os.Getenv("GOFILE") - moduleName := os.Getenv("GOPACKAGE") - args := os.Args +type args struct { + FileName string `arg:"-f,--,required,env:GOFILE" placeholder:"FILE_NAME" help:"File to transform."` + ModuleName string `arg:"-m,--,required,env:GOPACKAGE" placeholder:"MODULE_NAME" help:"Module containing the file to transform."` + PrometheusUrl string `arg:"--prom_url,env:AM_PROMETHEUS_URL" placeholder:"PROMETHEUS_URL" default:"http://localhost:9090" help:"Base URL of the Prometheus instance to generate links to."` + UseOtel bool `arg:"--otel" default:"false" help:"Use OpenTelemetry client library to instrument code instead of default Prometheus."` + AllowCustomLatencies bool `arg:"--custom-latency" default:"false" help:"Allow non-default latencies to be used in latency-based SLOs."` +} - if contains(args, longVersionFlag) || contains(args, shortVersionFlag) { - printVersion() - os.Exit(0) - } +func (args) Version() string { + var buf strings.Builder - if contains(args, longHelpFlag) || contains(args, shortHelpFlag) { - printHelp() - 
os.Exit(0) + fmt.Fprintf(&buf, "Autometrics %s", build.Version) + if build.Time != "" { + fmt.Fprintf(&buf, " (Built on %s)", build.Time) } - if fileName == "" { - printHelp() - os.Exit(1) - } + return buf.String() +} - prometheusUrl, envVarExists := os.LookupEnv(prometheusAddressEnvironmentVariable) - if !envVarExists { - prometheusUrl = DefaultPrometheusInstanceUrl - } +func (args) Description() string { + var buf strings.Builder - implementation := autometrics.PROMETHEUS - if contains(args, useOtelFlag) { - implementation = autometrics.OTEL - } + fmt.Fprintf(&buf, + "Autometrics instruments annotated functions, and adds links in their doc comments to graphs of their live usage.\n\n") - ctx, err := internal.NewGeneratorContext(implementation, prometheusUrl, contains(args, allowCustomLatencies)) - if err != nil { - log.Fatalf("error initialising autometrics context: %s", err) - } + fmt.Fprintf(&buf, + "It is meant to be used in a Go generator context. As such, it takes mandatory arguments in the form of environment variables.\n"+ + "You can also control the base URL of the prometheus instance in doc comments with an environment variable.\n") + fmt.Fprintf(&buf, + "\tNote: If you do not use the custom latencies in the SLO, the allowed latencies (in seconds) are %v\n\n", + autometrics.DefBuckets) - if err := generate.TransformFile(ctx, fileName, moduleName); err != nil { - log.Fatalf("error transforming %s: %s", fileName, err) - } + fmt.Fprintln(&buf, + "Check https://github.com/autometrics-dev/autometrics-go for more help (including examples) and information.") + fmt.Fprintf(&buf, + "Autometrics is built by Fiberplane -- https://autometrics.dev\n") + + return buf.String() } -func contains[T comparable](s []T, e T) bool { - for _, v := range s { - if v == e { - return true - } +func main() { + var args args + arg.MustParse(&args) + + implementation := autometrics.PROMETHEUS + if args.UseOtel { + implementation = autometrics.OTEL } - return false -} -func 
printVersion() { - fmt.Printf("%s\n", build.Version) - if build.Time != "" { - fmt.Printf("Built on %s\n", build.Time) + ctx, err := internal.NewGeneratorContext(implementation, args.PrometheusUrl, args.AllowCustomLatencies) + if err != nil { + log.Fatalf("error initialising autometrics context: %s", err) } -} -func printHelp() { - fmt.Printf("Autometrics %s", build.Version) - if build.Time != "" { - fmt.Printf(" (%s)", build.Time) + if err := generate.TransformFile(ctx, args.FileName, args.ModuleName); err != nil { + log.Fatalf("error transforming %s: %s", args.FileName, err) } - fmt.Printf("\nBuilt by Autometrics team -- https://autometrics.dev\n\n") - - fmt.Printf( - "usage: %s [%s | %s] [%s | %s] [%s] [%s] \n\n", - os.Args[0], - shortVersionFlag, - longVersionFlag, - shortHelpFlag, - longHelpFlag, - useOtelFlag, - allowCustomLatencies, - ) - - fmt.Println("Autometrics is meant to be used in a Go generator context. As such, it takes mandatory arguments in the form of environment variables:") - fmt.Println(" GOFILE\tPath to the file to transform.") - fmt.Println(" GOPACKAGE\tName to the containing package.") - fmt.Printf("\n\n") - - fmt.Println("Autometrics generates links to a prometheus instance in the doc comments of instrumented functions. 
You can control the base URL of the pointed to instance with an environment variable:") - fmt.Printf(" %s\tBase URL of the Prometheus instance to generate links to (default: %s)\n", - prometheusAddressEnvironmentVariable, - DefaultPrometheusInstanceUrl, - ) - fmt.Printf("\n\n") - - fmt.Printf(" %s\tUse OpenTelemetry client library to instrument code instead of default Prometheus\n", - useOtelFlag, - ) - fmt.Printf(" %s\tAllow non-default latencies to be used in latency-based SLOs (the default values in seconds are %v)\n", - allowCustomLatencies, - autometrics.DefBuckets, - ) - fmt.Printf("\n") - - fmt.Println("Check https://github.com/autometrics-dev/autometrics-go for more help (including examples) and usage information.") } diff --git a/examples/otel/README.md b/examples/otel/README.md index 8cb369f..8808c5a 100644 --- a/examples/otel/README.md +++ b/examples/otel/README.md @@ -8,7 +8,7 @@ with a Prometheus exporter instead of using a Prometheus only client crate. You can notice the 3 differences that are mentionned in the top-level README: - The amImpl import has been changed to `otel` -- The autometrics call in the Go generator has the `-otel` flag +- The autometrics call in the Go generator has the `--otel` flag - The `amImpl.Init` call uses a different first argument, with the name of the OpenTelemetry scope to use diff --git a/examples/otel/cmd/main.go b/examples/otel/cmd/main.go index e171565..29a1cbc 100644 --- a/examples/otel/cmd/main.go +++ b/examples/otel/cmd/main.go @@ -14,7 +14,7 @@ import ( // This should be `//go:generate autometrics` in practice. 
Those are hacks to get the example working, see // README -//go:generate go run ../../../cmd/autometrics/main.go -otel +//go:generate go run ../../../cmd/autometrics/main.go --otel func main() { rand.Seed(time.Now().UnixNano()) diff --git a/go.mod b/go.mod index 17ea8f2..1469e67 100644 --- a/go.mod +++ b/go.mod @@ -5,8 +5,8 @@ go 1.18 require github.com/prometheus/client_golang v1.14.0 require ( + github.com/alexflint/go-arg v1.4.3 github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 - github.com/slok/sloth v0.11.0 go.opentelemetry.io/otel v1.14.0 go.opentelemetry.io/otel/exporters/prometheus v0.37.0 go.opentelemetry.io/otel/metric v0.37.0 @@ -16,6 +16,7 @@ require ( ) require ( + github.com/alexflint/go-scalar v1.1.0 // indirect github.com/go-logr/logr v1.2.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect go.opentelemetry.io/otel/trace v1.14.0 // indirect diff --git a/go.sum b/go.sum index 8b1ea40..6b9c1da 100644 --- a/go.sum +++ b/go.sum @@ -38,6 +38,10 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= +github.com/alexflint/go-arg v1.4.3 h1:9rwwEBpMXfKQKceuZfYcwuc/7YY7tWJbFsgG5cAU/uo= +github.com/alexflint/go-arg v1.4.3/go.mod h1:3PZ/wp/8HuqRZMUUgu7I+e1qcpUbvmS258mRXkFH4IA= +github.com/alexflint/go-scalar v1.1.0 h1:aaAouLLzI9TChcPXotr6gUhq+Scr8rl0P9P4PnltbhM= +github.com/alexflint/go-scalar v1.1.0/go.mod h1:LoFvNMqS1CPrMVltza4LvnGKhaSpc3oyLEBUZVhhS2o= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= 
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -202,8 +206,6 @@ github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= -github.com/slok/sloth v0.11.0 h1:0N3975hhO8izJoHIiPMBKZWxk6lxamuTd45MxYsOk04= -github.com/slok/sloth v0.11.0/go.mod h1:xE9zMDVvMb5ylMhkacDtC02vmRhZHNuqe5ez93OiDms= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= @@ -211,6 +213,7 @@ github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpE github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= From 07b62e5ef550f968064b863fbbc67ea9658aa967 Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Thu, 4 May 2023 14:45:27 +0200 Subject: [PATCH 08/10] Review --- cmd/autometrics/doc.go | 32 ++++++++++---- cmd/autometrics/main.go | 3 -- examples/otel/cmd/main.go | 44 ++++++++++++-------- examples/otel/scripts/build_server | 4 +- examples/web/cmd/main.go | 46 ++++++++++++--------- 
examples/web/cmd/main.go.orig | 12 +++--- examples/web/scripts/build_server | 4 +- internal/autometrics/prometheus_link_gen.go | 9 +++- internal/build/build.go | 3 -- internal/generate/generate_test.go | 24 +++++------ pkg/autometrics/global_state.go | 17 ++++---- pkg/autometrics/main.go | 27 +++++++++--- pkg/autometrics/otel/instrument.go | 8 ++-- pkg/autometrics/otel/otel.go | 8 ++-- pkg/autometrics/prometheus/instrument.go | 8 ++-- pkg/autometrics/prometheus/prometheus.go | 16 +++---- scripts/build_generator | 3 +- 17 files changed, 158 insertions(+), 110 deletions(-) diff --git a/cmd/autometrics/doc.go b/cmd/autometrics/doc.go index 7e6e552..196e477 100644 --- a/cmd/autometrics/doc.go +++ b/cmd/autometrics/doc.go @@ -1,22 +1,36 @@ -// Autometrics runs as Go generator and updates a source file to add usage queries and metric collection to annotated functions. -// -// As a Go generator, it relies on the environment variables `GOFILE` and -// `GOPACKAGE` to find the target file to edit. +// Autometrics instruments annotated functions, and adds links in their doc comments to graphs of their live usage. // // By default, `autometrics` generates metric collection code for usage with the // [Prometheus client library]. If you want to use [OpenTelemetry metrics] -// instead (with a prometheus exporter for the metrics), pass the `-otel` flag +// instead (with a prometheus exporter for the metrics), pass the `--otel` flag // to the invocation. // // By default, when activating Service Level Objectives (SLOs) `autometrics` // does not allow to use latency targets that are outside the default latencies // defined in [autometrics.DefBuckets]. If you want to use custom latencies for -// your latency SLOs, pass the `-custom-latency` flag to the invocation. +// your latency SLOs, pass the `--custom-latency` flag to the invocation. +// +// It is meant to be used in a Go generator context. As such, it takes mandatory arguments in the form of environment variables. 
+// You can also control the base URL of the prometheus instance in doc comments with an environment variable. +// Note: If you do not use the custom latencies in the SLO, the allowed latencies (in seconds) are in [autometrics.DefBuckets]. +// +// Check https://github.com/autometrics-dev/autometrics-go for more help (including examples) and information. +// Autometrics is built by Fiberplane -- https://autometrics.dev +// +// Usage: autometrics -f FILE_NAME -m MODULE_NAME [--prom_url PROMETHEUS_URL] [--otel] [--custom-latency] // -// By default, the generated links in the documentation point to a Prometheus -// instance at http://localhost:9090. You can use the environment variable -// `AM_PROMETHEUS_URL` to change the base URL in the documentation links. +// Options: +// -f FILE_NAME File to transform. [env: GOFILE] +// -m MODULE_NAME Module containing the file to transform. [env: GOPACKAGE] +// --prom_url PROMETHEUS_URL +// Base URL of the Prometheus instance to generate links to. [default: http://localhost:9090, env: AM_PROMETHEUS_URL] +// --otel Use [OpenTelemetry client library] to instrument code instead of default [Prometheus client library]. [default: false] +// --custom-latency Allow non-default latencies to be used in latency-based SLOs. 
[default: false] +// --help, -h display this help and exit +// --version display version and exit // // [Prometheus client library]: https://github.com/prometheus/client_golang +// [OpenTelemetry client library]: https://github.com/open-telemetry/opentelemetry-go // [OpenTelemetry metrics]: https://opentelemetry.io/docs/instrumentation/go/ +// [autometrics.DefBuckets]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics#DefBuckets package main diff --git a/cmd/autometrics/main.go b/cmd/autometrics/main.go index 9820db6..1fac70f 100644 --- a/cmd/autometrics/main.go +++ b/cmd/autometrics/main.go @@ -29,9 +29,6 @@ func (args) Version() string { var buf strings.Builder fmt.Fprintf(&buf, "Autometrics %s", build.Version) - if build.Time != "" { - fmt.Fprintf(&buf, " (Built on %s)", build.Time) - } return buf.String() } diff --git a/examples/otel/cmd/main.go b/examples/otel/cmd/main.go index 29a1cbc..bbe1515 100644 --- a/examples/otel/cmd/main.go +++ b/examples/otel/cmd/main.go @@ -16,6 +16,12 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go --otel +var ( + Version = "development" + Commit = "n/a" + Branch string +) + func main() { rand.Seed(time.Now().UnixNano()) @@ -26,9 +32,9 @@ func main() { "web-server", amImpl.DefBuckets, amImpl.BuildInfo{ - Version: "0.4.0", - Commit: "anySHA", - BuildTime: "", + Version: Version, + Commit: Commit, + Branch: Branch, }, ) @@ -57,17 +63,19 @@ func main() { // - [Concurrent Calls] // // Or, dig into the metrics of *functions called by* `indexHandler` +// // - [Request Rate Callee] +// // - [Error Ratio Callee] // -// autometrics:doc-end Generated documentation by Autometrics. +// autometrics:doc-end Generated documentation by Autometrics. 
// -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0 +// [Concurrent 
Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 // //autometrics:doc --slo "API" --latency-target 99 --latency-ms 250 func indexHandler(w http.ResponseWriter, _ *http.Request) error { @@ -103,17 +111,19 @@ var handlerError = errors.New("failed to handle request") // - [Concurrent Calls] // // Or, dig into the metrics of *functions called by* `randomErrorHandler` +// // - [Request Rate Callee] +// // - [Error Ratio Callee] // -// autometrics:doc-end Generated documentation by Autometrics. +// autometrics:doc-end Generated documentation by Autometrics. 
// -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%29+function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29%29&g0.tab=0 -// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 // //autometrics:doc --slo "API" --success-target 90 func randomErrorHandler(w http.ResponseWriter, _ *http.Request) (err error) { diff --git a/examples/otel/scripts/build_server b/examples/otel/scripts/build_server index 865bd73..132fe64 100755 --- a/examples/otel/scripts/build_server +++ b/examples/otel/scripts/build_server @@ -4,8 +4,8 @@ set -euo pipefail VERSION=${VERSION:-development} COMMIT=`git log -1 --format="%H"` -BUILD_TIME=`date -Iseconds` +BRANCH=`git branch --show-current` SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" -go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.BuildTime=${BUILD_TIME}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go +go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.Branch=${BRANCH}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go diff --git a/examples/web/cmd/main.go b/examples/web/cmd/main.go index 7a5a7c3..5898f59 100644 --- a/examples/web/cmd/main.go +++ b/examples/web/cmd/main.go @@ -16,9 +16,11 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go -var Version = "development" -var Commit = "n/a" -var BuildTime string +var ( + Version = "development" + Commit = "n/a" + Branch string +) func main() { rand.Seed(time.Now().UnixNano()) 
@@ -30,9 +32,9 @@ func main() { nil, amImpl.DefBuckets, amImpl.BuildInfo{ - Version: Version, - Commit: Commit, - BuildTime: BuildTime, + Version: Version, + Commit: Commit, + Branch: Branch, }, ) @@ -61,17 +63,19 @@ func main() { // - [Concurrent Calls] // // Or, dig into the metrics of *functions called by* `indexHandler` +// // - [Request Rate Callee] +// // - [Error Ratio Callee] // -// autometrics:doc-end Generated documentation by Autometrics. +// autometrics:doc-end Generated documentation by Autometrics. // -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 -// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 -// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60indexHandler%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60indexHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22indexHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio Callee]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 // //autometrics:doc --slo "API" --latency-target 99 --latency-ms 250 func indexHandler(w http.ResponseWriter, _ *http.Request) error { @@ -107,17 +111,19 @@ var handlerError = errors.New("failed to handle request") // - [Concurrent Calls] // // Or, dig into the metrics of *functions called by* `randomErrorHandler` +// // - [Request Rate Callee] +// // - [Error Ratio Callee] // -// autometrics:doc-end Generated documentation by Autometrics. +// autometrics:doc-end Generated documentation by Autometrics. 
// -// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 -// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 -// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 -// [Concurrent Calls]: 
http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 -// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0 -// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0 +// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 +// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60randomErrorHandler%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0 +// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60randomErrorHandler%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22randomErrorHandler%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Request Rate 
Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 +// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 // //autometrics:doc --slo "API" --success-target 90 func randomErrorHandler(w http.ResponseWriter, _ *http.Request) (err error) { diff --git a/examples/web/cmd/main.go.orig b/examples/web/cmd/main.go.orig index 86057f7..34f0069 100644 --- a/examples/web/cmd/main.go.orig +++ b/examples/web/cmd/main.go.orig @@ -16,9 +16,11 @@ import ( // README //go:generate go run ../../../cmd/autometrics/main.go -var Version = "development" -var Commit = "n/a" -var BuildTime string +var ( + Version = "development" + Commit = "n/a" + Branch string +) func main() { rand.Seed(time.Now().UnixNano()) @@ -31,8 +33,8 @@ func main() { amImpl.DefBuckets, amImpl.BuildInfo{ Version: Version, - Commit: Commit, - BuildTime: BuildTime, + Commit: Commit, + Branch: Branch, }, ) diff --git a/examples/web/scripts/build_server b/examples/web/scripts/build_server 
index 865bd73..132fe64 100755 --- a/examples/web/scripts/build_server +++ b/examples/web/scripts/build_server @@ -4,8 +4,8 @@ set -euo pipefail VERSION=${VERSION:-development} COMMIT=`git log -1 --format="%H"` -BUILD_TIME=`date -Iseconds` +BRANCH=`git branch --show-current` SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" -go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.BuildTime=${BUILD_TIME}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go +go build -v -ldflags="-X 'main.Version=${VERSION}' -X 'main.Commit=${COMMIT}' -X 'main.Branch=${BRANCH}'" -o web-server ${SCRIPT_DIR}/../cmd/main.go diff --git a/internal/autometrics/prometheus_link_gen.go b/internal/autometrics/prometheus_link_gen.go index 7c963e5..d6f99e8 100644 --- a/internal/autometrics/prometheus_link_gen.go +++ b/internal/autometrics/prometheus_link_gen.go @@ -47,7 +47,7 @@ func (p Prometheus) makePrometheusUrl(query, comment string) url.URL { } func addBuildInfoLabels() string { - return fmt.Sprintf("* on (instance, job) group_left(%s, %s) %s", + return fmt.Sprintf("* on (instance, job) group_left(%s, %s) last_over_time(%s[1s])", prometheus.VersionLabel, prometheus.CommitLabel, prometheus.BuildInfoName, @@ -94,7 +94,12 @@ func latencyQuery(bucketName, labelKey, labelValue string) string { addBuildInfoLabels(), ) - return fmt.Sprintf("histogram_quantile(0.99, %s) or histogram_quantile(0.95, %s)", latency, latency) + return fmt.Sprintf( + "label_replace(histogram_quantile(0.99, %s), \"percentile_latency\", \"99\", \"\", \"\") or "+ + "label_replace(histogram_quantile(0.95, %s),\"percentile_latency\", \"95\", \"\", \"\")", + latency, + latency, + ) } func concurrentCallsQuery(gaugeName, labelKey, labelValue string) string { diff --git a/internal/build/build.go b/internal/build/build.go index 3e3cd57..3600686 100644 --- a/internal/build/build.go +++ b/internal/build/build.go @@ -1,7 +1,4 @@ package build // import 
"github.com/autometrics-dev/autometrics-go/internal/build" -// Time is the timestamp of the build, when made available through ldflags. -var Time string - // Version is the version string of the build, when made available through ldflags. var Version = "development" diff --git a/internal/generate/generate_test.go b/internal/generate/generate_test.go index bf3bbca..5936c29 100644 --- a/internal/generate/generate_test.go +++ b/internal/generate/generate_test.go @@ -62,12 +62,12 @@ func main() { "//\n" + "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + - "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + - "// [Latency (95th and 99th percentiles)]: 
http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + - "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + + "// [Request 
Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0\n" + + "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0\n" + + "// [Concurrent Calls]: 
http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Request Rate Callee]: http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"Service Test\" --success-target 99\n" + "func main() {\n" + @@ -148,12 +148,12 @@ func main() { "//\n" + "//\tautometrics:doc-end Generated documentation by Autometrics.\n" + "//\n" + - "// [Request Rate]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + - "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Ahistogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+or+histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + - "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Request Rate Callee]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29&g0.tab=0\n" + - "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+build_info%29%29&g0.tab=0\n" + + "// [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Error Ratio]: 
http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0\n" + + "// [Latency (95th and 99th percentiles)]: http://localhost:9090/graph?g0.expr=%23+95th+and+99th+percentile+latencies+%28in+seconds%29+for+the+%60main%60+function%0A%0Alabel_replace%28histogram_quantile%280.99%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C+%22percentile_latency%22%2C+%2299%22%2C+%22%22%2C+%22%22%29+or+label_replace%28histogram_quantile%280.95%2C+sum+by+%28le%2C+function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_duration_bucket%7Bfunction%3D%22main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29%2C%22percentile_latency%22%2C+%2295%22%2C+%22%22%2C+%22%22%29&g0.tab=0\n" + + "// [Concurrent Calls]: http://localhost:9090/graph?g0.expr=%23+Concurrent+calls+to+the+%60main%60+function%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28function_calls_concurrent%7Bfunction%3D%22main%22%7D+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Request Rate Callee]: 
http://localhost:9090/graph?g0.expr=%23+Rate+of+function+calls+emanating+from+%60main%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0\n" + + "// [Error Ratio Callee]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+function+emanating+from+%60main%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bcaller%3D%22main.main%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0\n" + "//\n" + "//autometrics:doc --slo \"API\" --latency-target 99.9 --latency-ms 500\n" + "func main() {\n" + diff --git a/pkg/autometrics/global_state.go b/pkg/autometrics/global_state.go index 10fb715..1e81ebd 100644 --- a/pkg/autometrics/global_state.go +++ b/pkg/autometrics/global_state.go @@ -1,8 +1,11 @@ package autometrics // import "github.com/autometrics-dev/autometrics-go/pkg/autometrics" +// These variables are describing the state of the application being autometricized, +// _not_ the build information of the binary + var version string var commit string -var buildTime string +var branch string // GetVersion returns the version of the codebase being instrumented. func GetVersion() string { @@ -24,12 +27,12 @@ func SetCommit(newCommit string) { commit = newCommit } -// GetBuildTime returns the build timestamp of the codebase being instrumented. 
-func GetBuildTime() string { - return buildTime +// GetBranch returns the branch of the build of the codebase being instrumented. +func GetBranch() string { + return branch } -// SetBuildTime sets the build timestamp of the codebase being instrumented. -func SetBuildTime(newBuildTime string) { - buildTime = newBuildTime +// SetBranch sets the branch of the build of the codebase being instrumented. +func SetBranch(newBranch string) { + branch = newBranch } diff --git a/pkg/autometrics/main.go b/pkg/autometrics/main.go index 6ba1f27..0a3fd40 100644 --- a/pkg/autometrics/main.go +++ b/pkg/autometrics/main.go @@ -42,19 +42,34 @@ type Context struct { // Only amImpl.Instrument should read this value. // Only amImpl.PreInstrument should write this value. // - // This value is only exported for the child packages "prometheus" and "otel" + // (amImpl is either the [Prometheus] or the [Open Telemetry] implementation) + // + // This value is only exported for the child packages [Prometheus] and [Open Telemetry] + // + // [Prometheus]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus + // [Open Telemetry]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel StartTime time.Time // CallInfo contains all the relevant data for caller information. // Only amImpl.Instrument should read this value. // Only amImpl.PreInstrument should write/read this value. // - // This value is only exported for the child packages "prometheus" and "otel" + // (amImpl is either the [Prometheus] or the [Open Telemetry] implementation) + // + // This value is only exported for the child packages [Prometheus] and [Open Telemetry] + // + // [Prometheus]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus + // [Open Telemetry]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel CallInfo CallInfo // BuildInfo contains all the relevant data for caller information. 
// Only amImpl.Instrument and PreInstrument should read this value. // Only amImpl.Init should write/read this value. // - // This value is only exported for the child packages "prometheus" and "otel" + // (amImpl is either the [Prometheus] or the [Open Telemetry] implementation) + // + // This value is only exported for the child packages [Prometheus] and [Open Telemetry] + // + // [Prometheus]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus + // [Open Telemetry]: https://godoc.org/github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel BuildInfo BuildInfo Context context.Context } @@ -77,8 +92,8 @@ type BuildInfo struct { Commit string // Version is the version of the code. Version string - // BuildTime is the timestamp of the build of the codebase. - BuildTime string + // Branch is the branch of the build of the codebase. + Branch string } func NewContext() Context { @@ -93,7 +108,7 @@ func NewContext() Context { func (c *Context) FillBuildInfo() { c.BuildInfo.Version = GetVersion() c.BuildInfo.Commit = GetCommit() - c.BuildInfo.BuildTime = GetBuildTime() + c.BuildInfo.Branch = GetBranch() } func (c Context) Validate(allowCustomLatencies bool) error { diff --git a/pkg/autometrics/otel/instrument.go b/pkg/autometrics/otel/instrument.go index cf2a8aa..9217422 100644 --- a/pkg/autometrics/otel/instrument.go +++ b/pkg/autometrics/otel/instrument.go @@ -51,7 +51,7 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(SloNameLabel).String(sloName), attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), - attribute.Key(BuildTimeLabel).String(ctx.BuildInfo.BuildTime), + attribute.Key(BranchLabel).String(ctx.BuildInfo.Branch), }...) 
functionCallsDuration.Record(ctx.Context, time.Since(ctx.StartTime).Seconds(), []attribute.KeyValue{ @@ -63,7 +63,7 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(SloNameLabel).String(sloName), attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), - attribute.Key(BuildTimeLabel).String(ctx.BuildInfo.BuildTime), + attribute.Key(BranchLabel).String(ctx.BuildInfo.Branch), }...) if ctx.TrackConcurrentCalls { @@ -74,7 +74,7 @@ func Instrument(ctx *autometrics.Context, err *error) { attribute.Key(CallerLabel).String(callerLabel), attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), - attribute.Key(BuildTimeLabel).String(ctx.BuildInfo.BuildTime), + attribute.Key(BranchLabel).String(ctx.BuildInfo.Branch), }...) } } @@ -101,7 +101,7 @@ func PreInstrument(ctx *autometrics.Context) *autometrics.Context { attribute.Key(CallerLabel).String(callerLabel), attribute.Key(CommitLabel).String(ctx.BuildInfo.Commit), attribute.Key(VersionLabel).String(ctx.BuildInfo.Version), - attribute.Key(BuildTimeLabel).String(ctx.BuildInfo.BuildTime), + attribute.Key(BranchLabel).String(ctx.BuildInfo.Branch), }...) } diff --git a/pkg/autometrics/otel/otel.go b/pkg/autometrics/otel/otel.go index e148e09..7fe82dd 100644 --- a/pkg/autometrics/otel/otel.go +++ b/pkg/autometrics/otel/otel.go @@ -66,8 +66,8 @@ const ( CommitLabel = "commit" // VersionLabel is the openTelemetry attribute that describes the version of the monitored codebase. VersionLabel = "version" - // BuildTimeLabel is the openTelemetry attribute that describes the timestamp of the build of the monitored codebase. - BuildTimeLabel = "build_time" + // BranchLabel is the openTelemetry attribute that describes the branch of the build of the monitored codebase. 
+ BranchLabel = "branch" ) func completeMeterName(meterName string) string { @@ -89,7 +89,7 @@ type BuildInfo = autometrics.BuildInfo func Init(meterName string, histogramBuckets []float64, buildInformation BuildInfo) error { autometrics.SetCommit(buildInformation.Commit) autometrics.SetVersion(buildInformation.Version) - autometrics.SetBuildTime(buildInformation.BuildTime) + autometrics.SetBranch(buildInformation.Branch) exporter, err := prometheus.New( // The units are removed from the exporter so that the names of the @@ -144,7 +144,7 @@ func Init(meterName string, histogramBuckets []float64, buildInformation BuildIn []attribute.KeyValue{ attribute.Key(CommitLabel).String(buildInformation.Commit), attribute.Key(VersionLabel).String(buildInformation.Version), - attribute.Key(BuildTimeLabel).String(buildInformation.BuildTime), + attribute.Key(BranchLabel).String(buildInformation.Branch), }...) return nil diff --git a/pkg/autometrics/prometheus/instrument.go b/pkg/autometrics/prometheus/instrument.go index e710732..42b8ca3 100644 --- a/pkg/autometrics/prometheus/instrument.go +++ b/pkg/autometrics/prometheus/instrument.go @@ -47,7 +47,7 @@ func Instrument(ctx *autometrics.Context, err *error) { ResultLabel: result, TargetSuccessRateLabel: successObjective, SloNameLabel: sloName, - BuildTimeLabel: ctx.BuildInfo.BuildTime, + BranchLabel: ctx.BuildInfo.Branch, CommitLabel: ctx.BuildInfo.Commit, VersionLabel: ctx.BuildInfo.Version, }).Inc() @@ -58,7 +58,7 @@ func Instrument(ctx *autometrics.Context, err *error) { TargetLatencyLabel: latencyTarget, TargetSuccessRateLabel: latencyObjective, SloNameLabel: sloName, - BuildTimeLabel: ctx.BuildInfo.BuildTime, + BranchLabel: ctx.BuildInfo.Branch, CommitLabel: ctx.BuildInfo.Commit, VersionLabel: ctx.BuildInfo.Version, }).Observe(time.Since(ctx.StartTime).Seconds()) @@ -68,7 +68,7 @@ func Instrument(ctx *autometrics.Context, err *error) { FunctionLabel: ctx.CallInfo.FuncName, ModuleLabel: ctx.CallInfo.ModuleName, CallerLabel: 
callerLabel, - BuildTimeLabel: ctx.BuildInfo.BuildTime, + BranchLabel: ctx.BuildInfo.Branch, CommitLabel: ctx.BuildInfo.Commit, VersionLabel: ctx.BuildInfo.Version, }).Dec() @@ -93,7 +93,7 @@ func PreInstrument(ctx *autometrics.Context) *autometrics.Context { FunctionLabel: ctx.CallInfo.FuncName, ModuleLabel: ctx.CallInfo.ModuleName, CallerLabel: callerLabel, - BuildTimeLabel: ctx.BuildInfo.BuildTime, + BranchLabel: ctx.BuildInfo.Branch, CommitLabel: ctx.BuildInfo.Commit, VersionLabel: ctx.BuildInfo.Version, }).Inc() diff --git a/pkg/autometrics/prometheus/prometheus.go b/pkg/autometrics/prometheus/prometheus.go index f7514ff..26fed54 100644 --- a/pkg/autometrics/prometheus/prometheus.go +++ b/pkg/autometrics/prometheus/prometheus.go @@ -57,8 +57,8 @@ const ( CommitLabel = "commit" // VersionLabel is the prometheus label that describes the version of the monitored codebase. VersionLabel = "version" - // BuildTimeLabel is the prometheus label that describes the timestamp of the build of the monitored codebase. - BuildTimeLabel = "build_time" + // BranchLabel is the prometheus label that describes the branch of the build of the monitored codebase. + BranchLabel = "branch" ) // BuildInfo holds meta information about the build of the instrumented code. 
@@ -79,24 +79,24 @@ type BuildInfo = autometrics.BuildInfo func Init(reg *prometheus.Registry, histogramBuckets []float64, buildInformation BuildInfo) error { autometrics.SetCommit(buildInformation.Commit) autometrics.SetVersion(buildInformation.Version) - autometrics.SetBuildTime(buildInformation.BuildTime) + autometrics.SetBranch(buildInformation.Branch) functionCallsCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: FunctionCallsCountName, - }, []string{FunctionLabel, ModuleLabel, CallerLabel, ResultLabel, TargetSuccessRateLabel, SloNameLabel, CommitLabel, VersionLabel, BuildTimeLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, ResultLabel, TargetSuccessRateLabel, SloNameLabel, CommitLabel, VersionLabel, BranchLabel}) functionCallsDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: FunctionCallsDurationName, Buckets: histogramBuckets, - }, []string{FunctionLabel, ModuleLabel, CallerLabel, TargetLatencyLabel, TargetSuccessRateLabel, SloNameLabel, CommitLabel, VersionLabel, BuildTimeLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, TargetLatencyLabel, TargetSuccessRateLabel, SloNameLabel, CommitLabel, VersionLabel, BranchLabel}) functionCallsConcurrent = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: FunctionCallsConcurrentName, - }, []string{FunctionLabel, ModuleLabel, CallerLabel, CommitLabel, VersionLabel, BuildTimeLabel}) + }, []string{FunctionLabel, ModuleLabel, CallerLabel, CommitLabel, VersionLabel, BranchLabel}) buildInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: BuildInfoName, - }, []string{CommitLabel, VersionLabel, BuildTimeLabel}) + }, []string{CommitLabel, VersionLabel, BranchLabel}) if reg != nil { reg.MustRegister(functionCallsCount) @@ -113,7 +113,7 @@ func Init(reg *prometheus.Registry, histogramBuckets []float64, buildInformation buildInfo.With(prometheus.Labels{ CommitLabel: buildInformation.Commit, VersionLabel: buildInformation.Version, - BuildTimeLabel: 
buildInformation.BuildTime, + BranchLabel: buildInformation.Branch, }).Set(1) return nil diff --git a/scripts/build_generator b/scripts/build_generator index e6dceaf..8dd5121 100755 --- a/scripts/build_generator +++ b/scripts/build_generator @@ -4,7 +4,6 @@ set -euo pipefail BUILD_PACK="github.com/autometrics-dev/autometrics-go/internal/build" VERSION=`git describe --tags` -BUILD_TIME=`date -u` SCRIPT_DIR="$( dirname -- "$( readlink -f -- "$0"; )"; )" -go build -v -ldflags="-X '${BUILD_PACK}.Version=${VERSION}' -X '${BUILD_PACK}.Time=${BUILD_TIME}'" ${SCRIPT_DIR}/../cmd/autometrics/main.go +go build -v -ldflags="-X '${BUILD_PACK}.Version=${VERSION}'" ${SCRIPT_DIR}/../cmd/autometrics/main.go From 7a525eeac1c78168c86e050e67fafc992aab4644 Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Thu, 4 May 2023 14:58:16 +0200 Subject: [PATCH 09/10] Rename amImpl to autometrics in end-user docs There is little reason to add a different name in the docs now that there is a single import again --- README.md | 46 +++++++++++++++++------------------ examples/otel/cmd/main.go | 36 ++++++++++++--------------- examples/web/cmd/main.go | 36 ++++++++++++--------------- examples/web/cmd/main.go.orig | 8 +++--- 4 files changed, 59 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index fb7b403..8e8f010 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ docker compose -f docker-compose.prometheus-example.yaml up ``` And then explore the generated links by opening the [main -file](./examples/web/cmd/main.go). +file](./examples/web/cmd/main.go) in your editor. ## How to use @@ -52,7 +52,7 @@ In the main entrypoint of your program, you need to both add package ``` go import ( - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" ) ``` @@ -62,13 +62,13 @@ And then in your main function initialize the metrics // Everything in BuildInfo is optional. 
// You can also use any string variable whose value is // injected at build time by ldflags. - amImpl.Init( + autometrics.Init( nil, - amImpl.DefBuckets, - amImpl.BuildInfo{ + autometrics.DefBuckets, + autometrics.BuildInfo{ Version: "0.4.0", Commit: "anySHA", - BuildTime: "", + Branch: "", }, ) ``` @@ -148,19 +148,19 @@ For Prometheus the shortest way is to add the handler code in your main entrypoi ``` go import ( - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) func main() { - amImpl.Init( + autometrics.Init( nil, - amImpl.DefBuckets, - amImpl.BuildInfo{ + autometrics.DefBuckets, + autometrics.BuildInfo{ Version: "0.4.0", Commit: "anySHA", - BuildTime: "", + Branch: "", }, ) http.Handle("/metrics", promhttp.Handler()) @@ -182,11 +182,11 @@ func RouteHandler(args interface{}) (err error) { } ``` -Then **you need to add** the [bundled](./configs/autometrics.rules.yml) +Then **you need to add** the [bundled](./configs/shared/autometrics.rules.yml) recording rules to your prometheus configuration. The valid arguments for alert generation are: -- `--slo` (*MANDATORY*): name of the service for which the objective is relevant +- `--slo` (*MANDATORY* for alert generation): name of the service for which the objective is relevant - `--success-rate` : target success rate of the function, between 0 and 100 (you must name the `error` return value of the function for detection to work.) - `--latency-ms` : maximum latency allowed for the function, in milliseconds. @@ -196,7 +196,7 @@ The valid arguments for alert generation are: > **Warning** > The generator will error out if you use targets that are not -supported by the bundled [Alerting rules file](./configs/autometrics.rules.yml). +supported by the bundled [Alerting rules file](./configs/shared/autometrics.rules.yml). 
Support for custom target is planned but not present at the moment ## (OPTIONAL) OpenTelemetry Support @@ -204,26 +204,26 @@ Support for custom target is planned but not present at the moment Autometrics supports using OpenTelemetry with a prometheus exporter instead of using Prometheus to publish the metrics. The changes you need to make are: -- change where the `amImpl` import points to +- change where the `autometrics` import points to ```patch import ( -- amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" -+ amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" +- autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" ++ autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" ) ``` -- change the call to `amImpl.Init` to the new signature: instead of a registry, +- change the call to `autometrics.Init` to the new signature: instead of a registry, the `Init` function takes a meter name for the `otel_scope` label of the exported metric. You can use the name of the application or its version for example ``` patch - amImpl.Init( + autometrics.Init( - nil, + "myApp/v2/prod", - amImpl.DefBuckets, - amImpl.BuildInfo{ + autometrics.DefBuckets, + autometrics.BuildInfo{ Version: "2.1.37", Commit: "anySHA", - BuildTime: "", + Branch: "", }, ) ``` @@ -264,4 +264,4 @@ The alerting system for SLOs that Autometrics uses is based on [Sloth](https://github.com/slok/sloth), and it has native Go types for marshalling/unmarshalling rules, so it should be possible to provide an extra binary in this repository, that only takes care of generating a new [rules -file](./configs/autometrics.rules.yml) with custom objectives. +file](./configs/shared/autometrics.rules.yml) with custom objectives. 
diff --git a/examples/otel/cmd/main.go b/examples/otel/cmd/main.go index bbe1515..424a6a6 100644 --- a/examples/otel/cmd/main.go +++ b/examples/otel/cmd/main.go @@ -8,7 +8,7 @@ import ( "net/http" "time" - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/otel" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -28,10 +28,10 @@ func main() { // Everything in BuildInfo is optional. // You can also use any string variable whose value is // injected at build time by ldflags. - amImpl.Init( + autometrics.Init( "web-server", - amImpl.DefBuckets, - amImpl.BuildInfo{ + autometrics.DefBuckets, + autometrics.BuildInfo{ Version: Version, Commit: Commit, Branch: Branch, @@ -63,12 +63,10 @@ func main() { // - [Concurrent Calls] // // Or, dig into the metrics of *functions called by* `indexHandler` -// // - [Request Rate Callee] -// // - [Error Ratio Callee] // -// autometrics:doc-end Generated documentation by Autometrics. +// autometrics:doc-end Generated documentation by Autometrics. 
// // [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 // [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 @@ -79,11 +77,11 @@ func main() { // //autometrics:doc --slo "API" --latency-target 99 --latency-ms 250 func indexHandler(w http.ResponseWriter, _ *http.Request) error { - defer amImpl.Instrument(amImpl.PreInstrument(amImpl.NewContext( - amImpl.WithConcurrentCalls(true), - amImpl.WithCallerName(true), - amImpl.WithSloName("API"), - amImpl.WithAlertLatency(250000000*time.Nanosecond, 99), + defer autometrics.Instrument(autometrics.PreInstrument(autometrics.NewContext( + autometrics.WithConcurrentCalls(true), + autometrics.WithCallerName(true), + autometrics.WithSloName("API"), + autometrics.WithAlertLatency(250000000*time.Nanosecond, 99), )), nil) //autometrics:defer time.Sleep(time.Duration(rand.Intn(500)) * time.Millisecond) @@ -111,12 +109,10 @@ var handlerError = errors.New("failed to handle request") // - [Concurrent Calls] // // Or, dig into the metrics of *functions called by* 
`randomErrorHandler` -// // - [Request Rate Callee] -// // - [Error Ratio Callee] // -// autometrics:doc-end Generated documentation by Autometrics. +// autometrics:doc-end Generated documentation by Autometrics. // // [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 // [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 @@ -127,11 +123,11 @@ var handlerError = errors.New("failed to handle request") // //autometrics:doc --slo "API" --success-target 90 func randomErrorHandler(w http.ResponseWriter, _ *http.Request) (err error) { - defer amImpl.Instrument(amImpl.PreInstrument(amImpl.NewContext( - amImpl.WithConcurrentCalls(true), - amImpl.WithCallerName(true), - amImpl.WithSloName("API"), - amImpl.WithAlertSuccess(90), + defer autometrics.Instrument(autometrics.PreInstrument(autometrics.NewContext( + autometrics.WithConcurrentCalls(true), + autometrics.WithCallerName(true), + autometrics.WithSloName("API"), + autometrics.WithAlertSuccess(90), )), &err) //autometrics:defer 
isErr := rand.Intn(2) == 0 diff --git a/examples/web/cmd/main.go b/examples/web/cmd/main.go index 5898f59..cab7efd 100644 --- a/examples/web/cmd/main.go +++ b/examples/web/cmd/main.go @@ -8,7 +8,7 @@ import ( "net/http" "time" - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -28,10 +28,10 @@ func main() { // Everything in BuildInfo is optional. // You can also use any string variable whose value is // injected at build time by ldflags. - amImpl.Init( + autometrics.Init( nil, - amImpl.DefBuckets, - amImpl.BuildInfo{ + autometrics.DefBuckets, + autometrics.BuildInfo{ Version: Version, Commit: Commit, Branch: Branch, @@ -63,12 +63,10 @@ func main() { // - [Concurrent Calls] // // Or, dig into the metrics of *functions called by* `indexHandler` -// // - [Request Rate Callee] -// // - [Error Ratio Callee] // -// autometrics:doc-end Generated documentation by Autometrics. +// autometrics:doc-end Generated documentation by Autometrics. 
// // [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60indexHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 // [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60indexHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22indexHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 @@ -79,11 +77,11 @@ func main() { // //autometrics:doc --slo "API" --latency-target 99 --latency-ms 250 func indexHandler(w http.ResponseWriter, _ *http.Request) error { - defer amImpl.Instrument(amImpl.PreInstrument(amImpl.NewContext( - amImpl.WithConcurrentCalls(true), - amImpl.WithCallerName(true), - amImpl.WithSloName("API"), - amImpl.WithAlertLatency(250000000*time.Nanosecond, 99), + defer autometrics.Instrument(autometrics.PreInstrument(autometrics.NewContext( + autometrics.WithConcurrentCalls(true), + autometrics.WithCallerName(true), + autometrics.WithSloName("API"), + autometrics.WithAlertLatency(250000000*time.Nanosecond, 99), )), nil) //autometrics:defer time.Sleep(time.Duration(rand.Intn(500)) * time.Millisecond) @@ -111,12 +109,10 @@ var handlerError = errors.New("failed to handle request") // - [Concurrent Calls] // // Or, dig into the metrics of *functions called by* 
`randomErrorHandler` -// // - [Request Rate Callee] -// // - [Error Ratio Callee] // -// autometrics:doc-end Generated documentation by Autometrics. +// autometrics:doc-end Generated documentation by Autometrics. // // [Request Rate]: http://localhost:9090/graph?g0.expr=%23+Rate+of+calls+to+the+%60randomErrorHandler%60+function+per+second%2C+averaged+over+5+minute+windows%0A%0Asum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29&g0.tab=0 // [Error Ratio]: http://localhost:9090/graph?g0.expr=%23+Percentage+of+calls+to+the+%60randomErrorHandler%60+function+that+return+errors%2C+averaged+over+5+minute+windows%0A%0A%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%2Cresult%3D%22error%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29+%2F+%28sum+by+%28function%2C+module%2C+version%2C+commit%29+%28rate%28function_calls_count%7Bfunction%3D%22randomErrorHandler%22%7D%5B5m%5D%29+%2A+on+%28instance%2C+job%29+group_left%28version%2C+commit%29+last_over_time%28build_info%5B1s%5D%29%29%29&g0.tab=0 @@ -127,11 +123,11 @@ var handlerError = errors.New("failed to handle request") // //autometrics:doc --slo "API" --success-target 90 func randomErrorHandler(w http.ResponseWriter, _ *http.Request) (err error) { - defer amImpl.Instrument(amImpl.PreInstrument(amImpl.NewContext( - amImpl.WithConcurrentCalls(true), - amImpl.WithCallerName(true), - amImpl.WithSloName("API"), - amImpl.WithAlertSuccess(90), + defer autometrics.Instrument(autometrics.PreInstrument(autometrics.NewContext( + autometrics.WithConcurrentCalls(true), + autometrics.WithCallerName(true), + autometrics.WithSloName("API"), + autometrics.WithAlertSuccess(90), )), &err) //autometrics:defer 
isErr := rand.Intn(2) == 0 diff --git a/examples/web/cmd/main.go.orig b/examples/web/cmd/main.go.orig index 34f0069..536c255 100644 --- a/examples/web/cmd/main.go.orig +++ b/examples/web/cmd/main.go.orig @@ -8,7 +8,7 @@ import ( "net/http" "time" - amImpl "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" + autometrics "github.com/autometrics-dev/autometrics-go/pkg/autometrics/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -28,10 +28,10 @@ func main() { // Everything in BuildInfo is optional. // You can also use any string variable whose value is // injected at build time by ldflags. - amImpl.Init( + autometrics.Init( nil, - amImpl.DefBuckets, - amImpl.BuildInfo{ + autometrics.DefBuckets, + autometrics.BuildInfo{ Version: Version, Commit: Commit, Branch: Branch, From c8f77267ee3d443711d51b7ff91035f4b6931c53 Mon Sep 17 00:00:00 2001 From: Gerry Agbobada Date: Thu, 4 May 2023 15:41:55 +0200 Subject: [PATCH 10/10] Rename amImpl import in example doc --- examples/otel/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/otel/README.md b/examples/otel/README.md index 8808c5a..1f0ed30 100644 --- a/examples/otel/README.md +++ b/examples/otel/README.md @@ -7,9 +7,9 @@ The only difference is that the metrics implementation used is OpenTelemetry with a Prometheus exporter instead of using a Prometheus only client crate. You can notice the 3 differences that are mentionned in the top-level README: -- The amImpl import has been changed to `otel` +- The autometrics import has been changed to `otel` - The autometrics call in the Go generator has the `--otel` flag -- The `amImpl.Init` call uses a different first argument, with the name of the +- The `autometrics.Init` call uses a different first argument, with the name of the OpenTelemetry scope to use ## Quickstart