Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/worker error budget #1286

Merged
merged 45 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ad67a7f
WIP
tylerwowen Oct 3, 2023
467362e
Add registry for custom metric name
tylerwowen Oct 4, 2023
e78ac55
handle default
tylerwowen Oct 4, 2023
2ba9133
Add metrics to Workers
killertiger Oct 5, 2023
73b2afa
Change HostTerminatorTest
Oct 6, 2023
25de1c8
Fix HostTerminatorTest
killertiger Oct 6, 2023
6e28710
Replace consts for the value
killertiger Oct 6, 2023
ee9612d
WIP
tylerwowen Oct 3, 2023
87d1e04
Add registry for custom metric name
tylerwowen Oct 4, 2023
d63cc64
handle default
tylerwowen Oct 4, 2023
9bf5282
Add metrics to Workers
killertiger Oct 5, 2023
467ffa0
Change HostTerminatorTest
Oct 6, 2023
4adf35b
Fix HostTerminatorTest
killertiger Oct 6, 2023
b72ee6b
Replace consts for the value
killertiger Oct 6, 2023
aafbb04
Merge branch 'feature/worker_error_budget' of https://github.com/pint…
killertiger Oct 9, 2023
be2990f
Add log line for error budget
killertiger Oct 12, 2023
4f5bb36
WIP
tylerwowen Oct 3, 2023
432a094
Add registry for custom metric name
tylerwowen Oct 4, 2023
b2a79a2
handle default
tylerwowen Oct 4, 2023
d51f3e9
Add metrics to Workers
killertiger Oct 5, 2023
072fec0
Change HostTerminatorTest
Oct 6, 2023
ae6fc42
Fix HostTerminatorTest
killertiger Oct 6, 2023
d4d013d
Replace consts for the value
killertiger Oct 6, 2023
38afb5c
WIP
tylerwowen Oct 3, 2023
6aed6de
Add registry for custom metric name
tylerwowen Oct 4, 2023
8d9fd21
Add metrics to Workers
killertiger Oct 5, 2023
df14247
Replace consts for the value
killertiger Oct 6, 2023
b559df4
Add log line for error budget
killertiger Oct 12, 2023
4913a33
Merge branch 'feature/worker_error_budget' of https://github.com/pint…
killertiger Oct 12, 2023
2eb1270
anything
killertiger Oct 12, 2023
461f991
remove uuid from the metric
killertiger Oct 12, 2023
ead95ed
PinStatsNamingConvention Custom name
killertiger Oct 13, 2023
b1dae04
Refactoring Error Budget metrics
killertiger Oct 13, 2023
c0380ec
Removing old code from Error Budget
killertiger Oct 13, 2023
c0a02d2
Revert changes for error budget
killertiger Oct 13, 2023
01947c2
anything
killertiger Oct 13, 2023
b192682
remove space
killertiger Oct 13, 2023
c95dc47
Merge branch 'master' into feature/worker_error_budget
killertiger Oct 13, 2023
92ed07c
remove cosmetic changes
killertiger Oct 13, 2023
5450ad5
Merge branch 'feature/worker_error_budget' of https://github.com/pint…
killertiger Oct 13, 2023
856ebb4
Removing comesmetic changes
killertiger Oct 13, 2023
042858c
Add consts to metric
killertiger Oct 13, 2023
99e4afb
Refactor ErrorBudget report using conts
killertiger Oct 14, 2023
086717a
Merge branch 'master' into feature/worker_error_budget
killertiger Oct 18, 2023
423c59f
Revert SimpleAgentJanitor and change on AgentJanitor
killertiger Oct 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package com.pinterest.deployservice.metrics;

import static com.pinterest.teletraan.universal.metrics.micrometer.PinStatsNamingConvention.CUSTOM_NAME_PREFIX;

public class MeterConstants {
public final static String ERROR_BUDGET_METRIC_NAME = CUSTOM_NAME_PREFIX + "error-budget.counters";
public final static String ERROR_BUDGET_TAG_NAME_METHOD_NAME = "method_name";
public final static String ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE = "response_type";
public final static String ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_SUCCESS = "success";
public final static String ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_FAILURE = "failure";
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@
import com.pinterest.deployservice.bean.HostAgentBean;
import com.pinterest.deployservice.bean.HostBean;
import com.pinterest.deployservice.bean.HostState;
import com.pinterest.deployservice.metrics.MeterConstants;
import com.pinterest.deployservice.rodimus.RodimusManager;

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Metrics;

/**
Expand All @@ -54,6 +56,8 @@ public class AgentJanitor extends SimpleAgentJanitor {
private final int agentlessHostBatchSize = 300;
private final AtomicInteger unreachableHostsCount;
private final AtomicInteger staleHostsCount;
private final Counter errorBudgetSuccess;
private final Counter errorBudgetFailure;
private long janitorStartTime;

public AgentJanitor(ServiceContext serviceContext, int minStaleHostThresholdSeconds,
Expand All @@ -63,6 +67,16 @@ public AgentJanitor(ServiceContext serviceContext, int minStaleHostThresholdSeco
maxLaunchLatencyThreshold = TimeUnit.SECONDS.toMillis(maxLaunchLatencyThresholdSeconds);
unreachableHostsCount = Metrics.gauge("unreachable_hosts", new AtomicInteger(0));
staleHostsCount = Metrics.gauge("stale_hosts", new AtomicInteger(0));

errorBudgetSuccess = Metrics.counter(MeterConstants.ERROR_BUDGET_METRIC_NAME,
MeterConstants.ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE,
MeterConstants.ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_SUCCESS,
MeterConstants.ERROR_BUDGET_TAG_NAME_METHOD_NAME, this.getClass().getSimpleName());

errorBudgetFailure = Metrics.counter(MeterConstants.ERROR_BUDGET_METRIC_NAME,
MeterConstants.ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE,
MeterConstants.ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_FAILURE,
MeterConstants.ERROR_BUDGET_TAG_NAME_METHOD_NAME, this.getClass().getSimpleName());
}

@Override
Expand All @@ -71,6 +85,7 @@ void processAllHosts() {
processStaleHosts();
determineStaleHostCandidates();
cleanUpAgentlessHosts();
errorBudgetSuccess.increment();
}

private Set<String> getTerminatedHostsFromSource(List<String> staleHostIds) {
Expand All @@ -82,6 +97,7 @@ private Set<String> getTerminatedHostsFromSource(List<String> staleHostIds) {
.getTerminatedHosts(staleHostIds.subList(i, Math.min(i + batchSize, staleHostIds.size()))));
} catch (Exception ex) {
LOG.error("Failed to get terminated hosts", ex);
errorBudgetFailure.increment();
}
}
return terminatedHosts;
Expand All @@ -94,6 +110,7 @@ private Long getInstanceLaunchGracePeriod(String clusterName) {
launchGracePeriod = rodimusManager.getClusterInstanceLaunchGracePeriod(clusterName);
} catch (Exception ex) {
LOG.error("failed to get launch grace period for cluster {}, exception: {}", clusterName, ex);
errorBudgetFailure.increment();
}
}
return launchGracePeriod == null ? maxLaunchLatencyThreshold : TimeUnit.SECONDS.toMillis(launchGracePeriod);
Expand All @@ -113,6 +130,8 @@ private boolean isHostStale(HostAgentBean hostAgentBean) {
hostBean = hostDAO.getHostsByHostId(hostAgentBean.getHost_id()).get(0);
} catch (Exception ex) {
LOG.error("failed to get host bean for ({}), {}", hostAgentBean, ex);
errorBudgetFailure.increment();

return false;
}

Expand Down Expand Up @@ -140,6 +159,7 @@ private Map<String, HostAgentBean> getStaleHostsMap(long minThreshold, long maxT
}
} catch (Exception ex) {
LOG.error("failed to get stale hosts", ex);
errorBudgetFailure.increment();
return staleHostMap;
}

Expand Down Expand Up @@ -171,6 +191,7 @@ private void determineStaleHostCandidates() {
HostAgentBean host = unreachableHostsMap.get(unreachableId);
LOG.info("{} has unreachable host {}", host.getAuto_scaling_group(), host.getHost_id());
}
errorBudgetSuccess.increment();
}
this.unreachableHostsCount.set(unreachableHostCount);
}
Expand All @@ -195,6 +216,7 @@ private void processStaleHosts() {
LOG.warn("{}:{} is stale (not Pinging Teletraan), but might be running.",
hostAgent.getAuto_scaling_group(), hostAgent.getHost_id());
staleHostCount++;
errorBudgetSuccess.increment();
} else {
LOG.debug("host {} is not stale", staleId);
}
Expand All @@ -217,6 +239,7 @@ private void cleanUpAgentlessHosts() {
agentlessHosts = hostDAO.getStaleAgentlessHostIds(noUpdateSince, agentlessHostBatchSize);
} catch (SQLException ex) {
LOG.error("failed to get agentless hosts", ex);
errorBudgetFailure.increment();
return;
}

Expand All @@ -226,6 +249,7 @@ private void cleanUpAgentlessHosts() {
removeStaleHost(hostId);
} else {
LOG.warn("Agentless host {} is stale but might be running", hostId);
errorBudgetSuccess.increment();
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@
import com.pinterest.deployservice.dao.PromoteDAO;
import com.pinterest.deployservice.dao.UtilDAO;
import com.pinterest.deployservice.handler.DeployHandler;
import com.pinterest.deployservice.metrics.MeterConstants;

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Metrics;

import com.google.common.base.Preconditions;
import org.apache.commons.lang.StringUtils;
Expand Down Expand Up @@ -59,6 +63,8 @@ public class AutoPromoter implements Runnable {
private BuildTagsManager buildTagsManager;
private int bufferTimeMinutes;
private final int maxCheckBuildsOrDeploys = 100;
private Counter errorBudgetSuccess;
private Counter errorBudgetFailure;


public AutoPromoter(ServiceContext serviceContext) {
Expand All @@ -70,6 +76,14 @@ public AutoPromoter(ServiceContext serviceContext) {
buildTagsManager = new BuildTagsManagerImpl(serviceContext.getTagDAO());
deployHandler = new DeployHandler(serviceContext);
bufferTimeMinutes = DEFAULT_BUFFER_TIME_MINUTE;

errorBudgetSuccess = Metrics.counter(MeterConstants.ERROR_BUDGET_METRIC_NAME,
MeterConstants.ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE, MeterConstants.ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_SUCCESS,
MeterConstants.ERROR_BUDGET_TAG_NAME_METHOD_NAME, this.getClass().getSimpleName());

errorBudgetFailure = Metrics.counter(MeterConstants.ERROR_BUDGET_METRIC_NAME,
MeterConstants.ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE, MeterConstants.ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_FAILURE,
MeterConstants.ERROR_BUDGET_TAG_NAME_METHOD_NAME, this.getClass().getSimpleName());
}

public AutoPromoter withBufferTimeMinutes(int bufferTime) {
Expand All @@ -82,16 +96,22 @@ void processBatch() throws Exception {
List<String> envIds = promoteDAO.getAutoPromoteEnvIds();
if (envIds.isEmpty()) {
LOG.debug("AutoPromoter did not find any valid env to work on, exiting.");

errorBudgetSuccess.increment();
return;
}
Collections.shuffle(envIds);
for (String envId : envIds) {
try {
LOG.debug("AutoPromoter chooses env {} to work on.", envId);
processOnce(envId);

errorBudgetSuccess.increment();
} catch (Throwable t) {
// Catch all throwable so that subsequent job not suppressed
LOG.error("AutoPromoter failed to process {}, Exception: {}", envId, t);

errorBudgetFailure.increment();
}
}
LOG.info("AutoPromoter processBatch finishes");
Expand Down Expand Up @@ -576,6 +596,8 @@ public void run() {
} catch (Throwable t) {
// Catch all throwable so that subsequent job not suppressed
LOG.error("Failed to call AutoPromoter.", t);

errorBudgetFailure.increment();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@

import com.pinterest.deployservice.dao.BuildDAO;
import com.pinterest.deployservice.dao.UtilDAO;
import com.pinterest.deployservice.metrics.MeterConstants;
import com.pinterest.teletraan.TeletraanServiceContext;

import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
Expand All @@ -29,6 +31,9 @@
import java.util.Collections;
import java.util.List;

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Metrics;

/**
* Remove unused and old builds.
* <p>
Expand All @@ -39,6 +44,8 @@
public class BuildJanitor implements Job {
tylerwowen marked this conversation as resolved.
Show resolved Hide resolved
private static final Logger LOG = LoggerFactory.getLogger(BuildJanitor.class);
private static final long MILLIS_PER_DAY = 86400000;
private Counter errorBudgetSuccess;
private Counter errorBudgetFailure;

public BuildJanitor() {
// If using the Job interface, must keep constructor empty.
Expand Down Expand Up @@ -66,6 +73,9 @@ void processBuilds(TeletraanServiceContext workerContext) throws Exception {
LOG.info(String.format("Successfully removed builds: %s before %d milliseconds has %d.", buildName, timeThreshold, numToDelete));
} catch (Exception e) {
LOG.error("Failed to delete builds from tables.", e);

errorBudgetFailure.increment();

} finally {
utilDAO.releaseLock(buildLockName, connection);
LOG.info(String.format("DB lock operation is successful: release lock %s", buildLockName));
Expand All @@ -84,10 +94,24 @@ public void execute(JobExecutionContext context) throws JobExecutionException {
LOG.info("Start build janitor process...");
SchedulerContext schedulerContext = context.getScheduler().getContext();
TeletraanServiceContext workerContext = (TeletraanServiceContext) schedulerContext.get("serviceContext");

errorBudgetSuccess = Metrics.counter(MeterConstants.ERROR_BUDGET_METRIC_NAME,
MeterConstants.ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE, MeterConstants.ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_SUCCESS,
MeterConstants.ERROR_BUDGET_TAG_NAME_METHOD_NAME, this.getClass().getSimpleName());

errorBudgetFailure = Metrics.counter(MeterConstants.ERROR_BUDGET_METRIC_NAME,
MeterConstants.ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE, MeterConstants.ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_FAILURE,
MeterConstants.ERROR_BUDGET_TAG_NAME_METHOD_NAME, this.getClass().getSimpleName());

processBuilds(workerContext);
LOG.info("Stop build janitor process...");

errorBudgetSuccess.increment();
} catch (Throwable t) {
LOG.error("Failed to call build janitor.", t);

if(errorBudgetFailure != null)
errorBudgetFailure.increment();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import com.pinterest.deployservice.dao.DeployDAO;
import com.pinterest.deployservice.dao.EnvironDAO;
import com.pinterest.deployservice.dao.UtilDAO;
import com.pinterest.deployservice.metrics.MeterConstants;

import org.quartz.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -28,6 +30,9 @@
import java.util.Collections;
import java.util.List;

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Metrics;

/**
* Removed unused/old deploys.
*/
Expand All @@ -38,6 +43,8 @@ public class DeployJanitor implements Job {
private EnvironDAO environDAO;
private DeployDAO deployDAO;
private UtilDAO utilDAO;
private Counter errorBudgetSuccess;
private Counter errorBudgetFailure;

public DeployJanitor() {
// If using the Job interface, must keep constructor empty.
Expand All @@ -62,8 +69,12 @@ void processDeploys() throws Exception {
deployDAO.deleteUnusedDeploys(envId, timeThreshold, numToDelete);
LOG.info(String.format("Successfully removed deploys: %s before %d milliseconds has %d.",
envId, timeThreshold, numToDelete));

errorBudgetSuccess.increment();
tylerwowen marked this conversation as resolved.
Show resolved Hide resolved
} catch (Exception e) {
LOG.error("Failed to delete builds from tables.", e);

errorBudgetFailure.increment();
} finally {
utilDAO.releaseLock(deployLockName, connection);
LOG.info(String.format("DB lock operation is successful: release lock %s", deployLockName));
Expand All @@ -79,10 +90,20 @@ void processDeploys() throws Exception {
public void execute(JobExecutionContext context) throws JobExecutionException {
SchedulerContext schedulerContext;

errorBudgetSuccess = Metrics.counter(MeterConstants.ERROR_BUDGET_METRIC_NAME,
MeterConstants.ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE, MeterConstants.ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_SUCCESS,
MeterConstants.ERROR_BUDGET_TAG_NAME_METHOD_NAME, this.getClass().getSimpleName());

errorBudgetFailure = Metrics.counter(MeterConstants.ERROR_BUDGET_METRIC_NAME,
MeterConstants.ERROR_BUDGET_TAG_NAME_RESPONSE_TYPE, MeterConstants.ERROR_BUDGET_TAG_VALUE_RESPONSE_TYPE_FAILURE,
MeterConstants.ERROR_BUDGET_TAG_NAME_METHOD_NAME, this.getClass().getSimpleName());

try {
schedulerContext = context.getScheduler().getContext();
} catch (SchedulerException e) {
LOG.error("Cannot retrive job context!", e);

errorBudgetFailure.increment();
return;
}

Expand All @@ -95,8 +116,12 @@ public void execute(JobExecutionContext context) throws JobExecutionException {
LOG.info("Start deploy janitor process...");
processDeploys();
LOG.info("Stop deploy janitor process...");

errorBudgetSuccess.increment();
} catch (Throwable t) {
LOG.error("Failed to call deploy janitor.", t);

errorBudgetFailure.increment();
}
}
}
Loading
Loading