health.go

// Package health is a library that enables *async* dependency health checking for services running on an orchestrated container platform such as kubernetes or mesos.
//
// For additional overview, documentation and contribution guidelines, refer to the project's "README.md".
//
// For example usage, refer to https://github.com/InVisionApp/go-health/tree/master/examples/simple-http-server.
package health

import (
	"errors"
	"sync"
	"time"

	"github.com/InVisionApp/go-logger"
)

//go:generate counterfeiter -o ./fakes/icheckable.go . ICheckable

var (
	// ErrNoAddCfgWhenActive is returned when you attempt to add check(s) to an already active healthcheck instance
	ErrNoAddCfgWhenActive = errors.New("Unable to add new check configuration(s) while healthcheck is active")

	// ErrAlreadyRunning is returned when you attempt to "h.Start()" an already running healthcheck
	ErrAlreadyRunning = errors.New("Healthcheck is already running - nothing to start")

	// ErrAlreadyStopped is returned when you attempt to "h.Stop()" a non-running healthcheck instance
	ErrAlreadyStopped = errors.New("Healthcheck is not running - nothing to stop")

	// ErrEmptyConfigs is returned when you attempt to add an empty slice of configs via "h.AddChecks()"
	ErrEmptyConfigs = errors.New("Configs appears to be empty - nothing to add")
)

// The IHealth interface can be useful if you plan on replacing the actual health
// checker with a mock during testing. Otherwise, you can set "hc.Disable = true"
// after instantiation.
type IHealth interface {
	AddChecks(cfgs []*Config) error
	AddCheck(cfg *Config) error
	Start() error
	Stop() error
	State() (map[string]State, bool, error)
	Failed() bool
}

// ICheckable is an interface implemented by a number of bundled checkers such
// as "MySQLChecker", "RedisChecker" and "HTTPChecker". By implementing the
// interface, you can feed your own custom checkers into the health library.
type ICheckable interface {
	// Status allows you to return additional data as an "interface{}" and "error"
	// to signify that the check has failed. If "interface{}" is non-nil, it will
	// be exposed under "State.Details" for that particular check.
	Status() (interface{}, error)
}

// IStatusListener is an interface that handles health check failures and
// recoveries, primarily for stats recording purposes
type IStatusListener interface {
	// HealthCheckFailed is a function that handles the failure of a health
	// check event. This function is called when a health check state
	// transitions from passing to failing.
	// 	* entry - The recorded state of the health check that triggered the failure
	HealthCheckFailed(entry *State)

	// HealthCheckRecovered is a function that handles the recovery of a failed
	// health check.
	// 	* entry - The recorded state of the health check that triggered the recovery
	// 	* recordedFailures - the total failed health checks that lapsed
	// 	  between the failure and recovery
	//	* failureDurationSeconds - the lapsed time, in seconds, of the recovered failure
	HealthCheckRecovered(entry *State, recordedFailures int64, failureDurationSeconds float64)
}

// Config is a struct used for defining and configuring checks.
type Config struct {
	// Name of the check
	Name string

	// Checker instance used to perform health check
	Checker ICheckable

	// Interval between health checks
	Interval time.Duration

	// Fatal marks a failing health check so that the
	// entire health check request fails with a 500 error
	Fatal bool

	// Hook that gets called when this health check is complete
	OnComplete func(state *State)
}

// State is a struct that contains the results of the latest
// run of a particular check.
type State struct {
	// Name of the health check
	Name string `json:"name"`

	// Status of the health check state ("ok" or "failed")
	Status string `json:"status"`

	// Err is the error returned from a failed health check
	Err string `json:"error,omitempty"`

	// Fatal shows if the check will affect global result
	Fatal bool `json:"fatal,omitempty"`

	// Details contains more contextual detail about a
	// failing health check.
	Details interface{} `json:"details,omitempty"` // contains JSON message (that can be marshaled)

	// CheckTime is the time of the last health check
	CheckTime time.Time `json:"check_time"`

	ContiguousFailures int64     `json:"num_failures"`     // the number of failures that occurred in a row
	TimeOfFirstFailure time.Time `json:"first_failure_at"` // the time of the initial transitional failure for any given health check
}

// indicates state is failure
func (s *State) isFailure() bool {
	return s.Status == "failed"
}

// Health contains internal go-health internal structures.
type Health struct {
	Logger log.Logger

	// StatusListener will report failures and recoveries
	StatusListener IStatusListener

	active     *sBool // indicates whether the healthcheck is actively running
	configs    []*Config
	states     map[string]State
	statesLock sync.Mutex
	runners    map[string]chan struct{} // contains map of active runners w/ a stop channel
}

// New returns a new instance of the Health struct.
func New() *Health {
	return &Health{
		Logger:     log.NewSimple(),
		configs:    make([]*Config, 0),
		states:     make(map[string]State, 0),
		runners:    make(map[string]chan struct{}, 0),
		active:     newBool(),
		statesLock: sync.Mutex{},
	}
}

// DisableLogging will disable all logging by inserting the noop logger.
func (h *Health) DisableLogging() {
	h.Logger = log.NewNoop()
}

// AddChecks is used for adding multiple check definitions at once (as opposed
// to adding them sequentially via "AddCheck()").
func (h *Health) AddChecks(cfgs []*Config) error {
	if h.active.val() {
		return ErrNoAddCfgWhenActive
	}

	h.configs = append(h.configs, cfgs...)

	return nil
}

// AddCheck is used for adding a single check definition to the current health
// instance.
func (h *Health) AddCheck(cfg *Config) error {
	if h.active.val() {
		return ErrNoAddCfgWhenActive
	}

	h.configs = append(h.configs, cfg)
	return nil
}

// Start will start all of the defined health checks. Each of the checks run in
// their own goroutines (as "time.Ticker").
func (h *Health) Start() error {
	if h.active.val() {
		return ErrAlreadyRunning
	}

	// if there are no check configs, this is a noop
	if len(h.configs) < 1 {
		return nil
	}

	for _, c := range h.configs {
		h.Logger.WithFields(log.Fields{"name": c.Name}).Debug("Starting checker")
		ticker := time.NewTicker(c.Interval)
		stop := make(chan struct{})

		h.startRunner(c, ticker, stop)

		h.runners[c.Name] = stop
	}

	// Checkers are now actively running
	h.active.setTrue()

	return nil
}

// Stop will cause all of the running health checks to be stopped. Additionally,
// all existing check states will be reset.
func (h *Health) Stop() error {
	if !h.active.val() {
		return ErrAlreadyStopped
	}

	for name, stop := range h.runners {
		h.Logger.WithFields(log.Fields{"name": name}).Debug("Stopping checker")
		close(stop)
	}

	// Reset runner map
	h.runners = make(map[string]chan struct{}, 0)

	// Reset states
	h.safeResetStates()

	return nil
}

// State will return a map of all current healthcheck states (thread-safe), a
// bool indicating whether the healthcheck has fully failed and a potential error.
//
// The returned structs can be used for figuring out additional analytics or
// used for building your own status handler (as opposed to using the built-in
// "hc.HandlerBasic" or "hc.HandlerJSON").
//
// The map key is the name of the check.
func (h *Health) State() (map[string]State, bool, error) {
	return h.safeGetStates(), h.Failed(), nil
}

// Failed will return the basic state of overall health. This should be used when
// details about the failure are not needed
func (h *Health) Failed() bool {
	for _, val := range h.safeGetStates() {
		if val.Fatal && val.isFailure() {
			return true
		}
	}
	return false
}

func (h *Health) startRunner(cfg *Config, ticker *time.Ticker, stop <-chan struct{}) {

	// function to execute and collect check data
	checkFunc := func() {
		data, err := cfg.Checker.Status()

		stateEntry := &State{
			Name:      cfg.Name,
			Status:    "ok",
			Details:   data,
			CheckTime: time.Now(),
			Fatal:     cfg.Fatal,
		}

		if err != nil {
			h.Logger.WithFields(log.Fields{
				"check": cfg.Name,
				"fatal": cfg.Fatal,
				"err":   err,
			}).Error("healthcheck has failed")

			stateEntry.Err = err.Error()
			stateEntry.Status = "failed"
		}

		h.safeUpdateState(stateEntry)

		if cfg.OnComplete != nil {
			go cfg.OnComplete(stateEntry)
		}
	}

	go func() {
		defer ticker.Stop()

		// execute once so that it is immediate
		checkFunc()

		// all following executions
	RunLoop:
		for {
			select {
			case <-ticker.C:
				checkFunc()
			case <-stop:
				break RunLoop
			}
		}

		h.Logger.WithFields(log.Fields{"name": cfg.Name}).Debug("Checker exiting")
	}()
}

// resets the states in a concurrency-safe manner
func (h *Health) safeResetStates() {
	h.statesLock.Lock()
	defer h.statesLock.Unlock()
	h.states = make(map[string]State, 0)
}

// updates the check state in a concurrency-safe manner
func (h *Health) safeUpdateState(stateEntry *State) {
	// dispatch any status listeners
	h.handleStatusListener(stateEntry)

	// update states here
	h.statesLock.Lock()
	defer h.statesLock.Unlock()

	h.states[stateEntry.Name] = *stateEntry
}

// get all states in a concurrency-safe manner
func (h *Health) safeGetStates() map[string]State {
	h.statesLock.Lock()
	defer h.statesLock.Unlock()

	// deep copy h.states to avoid race
	statesCopy := make(map[string]State, 0)

	for k, v := range h.states {
		statesCopy[k] = v
	}

	return statesCopy
}

// if a status listener is attached
func (h *Health) handleStatusListener(stateEntry *State) {
	// get the previous state
	h.statesLock.Lock()
	prevState := h.states[stateEntry.Name]
	h.statesLock.Unlock()

	// state is failure
	if stateEntry.isFailure() {
		if !prevState.isFailure() {
			// new failure: previous state was ok
			if h.StatusListener != nil {
				go h.StatusListener.HealthCheckFailed(stateEntry)
			}

			stateEntry.TimeOfFirstFailure = time.Now()
		} else {
			// carry the time of first failure from the previous state
			stateEntry.TimeOfFirstFailure = prevState.TimeOfFirstFailure
		}
		stateEntry.ContiguousFailures = prevState.ContiguousFailures + 1
	} else if prevState.isFailure() {
		// recovery, previous state was failure
		failureSeconds := time.Now().Sub(prevState.TimeOfFirstFailure).Seconds()

		if h.StatusListener != nil {
			go h.StatusListener.HealthCheckRecovered(stateEntry, prevState.ContiguousFailures, failureSeconds)
		}
	}
}