- * Note: this isn't really thread-safe; it's subject to race conditions, unless used very
- * carefully. Accordingly, usage of this field SHALL conform to the following rules:
- *
- * Following these rules will result in {@link ConcurrentMap#size()} being eventually
- * consistent but nevertheless ALWAYS >= the number of jobs that are actually enqueued, from the
- * view of the {@link VolunteerJob}'s thread. This aligns with what the {@link VolunteerJob} needs
- * the value for, as we're using it to avoid over-subscribing on work. Under-subscribing is fine,
- * in that context.
+ * Any exception generated by a job will halt processing and is stored here. Access is limited to
+ * synchronized methods.
*/
- private final ConcurrentMap<PipelineJobRecordId, PipelineJobHandle<?>> jobsEnqueuedHandles;

 /**
+ * Waits for processing to stop. External causes like jobs completing on their own or calls to
+ * {@link #stop} from the shutdown handler will cause the pool to shut down gracefully while we
+ * wait and thus allow us to return.
  */
- public PipelineManager(
- MetricRegistry appMetrics,
- PipelineJobRecordStore jobRecordStore,
- S3TaskManager s3TaskManagerHandle) {
- this.appMetrics = appMetrics;
- this.jobRecordStore = jobRecordStore;
- this.jobExecutor = createJobExecutor();
- this.jobsEnqueuedHandles = new ConcurrentHashMap<>();
- this.jobMonitorsExecutor = Executors.newCachedThreadPool();
- this.jobRegistry = new HashMap<>();
- this.s3TaskManagerHandle = s3TaskManagerHandle;
+ public void awaitCompletion() {
+ // The latch is itself thread-safe, so calls to it need no external synchronization.
+ while (latch.getCount() > 0) {
+ try {
+ latch.await();
+ } catch (InterruptedException ex) {
+ log.debug("caught interrupt - still waiting for latch to reach zero");
+ }
+ }
+ }
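The retry loop above exists to survive spurious interrupts: a stray Thread.interrupt() must not make awaitCompletion() return before the latch actually reaches zero. A minimal, self-contained sketch of the same idiom (the interrupt-status restore at the end is an extra courtesy to callers, not something the method above does):

import java.util.concurrent.CountDownLatch;

/** Sketch of the await-with-retry idiom used by awaitCompletion(). */
public final class LatchAwaitDemo {
  public static void main(String[] args) {
    CountDownLatch latch = new CountDownLatch(1);

    // Stand-in for the pool's termination path: another thread eventually
    // counts the latch down to zero.
    new Thread(
            () -> {
              try {
                Thread.sleep(100);
              } catch (InterruptedException ignored) {
                Thread.currentThread().interrupt();
              }
              latch.countDown();
            })
        .start();

    // Retry await() on interrupt so we only return once the count is zero.
    boolean interrupted = false;
    while (latch.getCount() > 0) {
      try {
        latch.await();
      } catch (InterruptedException ex) {
        interrupted = true; // remember the interrupt, but keep waiting
      }
    }
    if (interrupted) {
      Thread.currentThread().interrupt(); // restore interrupt status for callers
    }
  }
}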
- /*
- * Bootstrap the SchedulerJob and VolunteerJob, which are responsible for ensuring that all of
- * the other jobs get executed, as and when needed. Note that it will permanently tie up two of
- * the job executors, as they're designed to run forever.
- */
- VolunteerJob volunteerJob = new VolunteerJob(appMetrics, this, jobRecordStore);
- registerJob(volunteerJob);
- PipelineJobRecord<NullPipelineJobArguments> ...

 /**
- * This property, and its invariants, allow us to avoid over-committing and accepting more work
- * than we are guaranteed to have {@link #jobExecutor} threads/slots available for. This isn't
- * strictly necessary if jobs are only running on a single node, but is nevertheless a nice
- * property, and becomes very important if jobs are being run across multiple nodes (as otherwise
- * we'd have unnecessarily stalled work).
+ * {@inheritDoc}
*
- * @return the number of available executor slots (technically, an eventually consistent
- * approximation of that value, which is guaranteed to be less-than-or-equal-to the true
- * value, provided that the thread-safety rules described in {@link #jobsEnqueuedHandles} are
- * adhered to)
+ * @return {@code true} if it is OK for jobs to run
*/
- public int getOpenExecutorSlots() {
- return JOB_EXECUTOR_THREADS - jobsEnqueuedHandles.size();
+ @Override
+ public synchronized boolean jobsCanRun() {
+ return isRunning && error == null;
}
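The removed getOpenExecutorSlots() depended on the ordering rules described for jobsEnqueuedHandles: entries are added before the work can start and removed only after it finishes, so size() can only over-count enqueued jobs and the derived slot count can only under-count open slots. That trades occasional under-subscription for never over-subscribing. A sketch of that ordering, with all names being illustrative stand-ins rather than the diff's real types:

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;

/** Sketch of "insert before submit, remove after finish" slot accounting. */
final class SlotCountSketch {
  static final int MAX_SLOTS = 4; // stand-in for JOB_EXECUTOR_THREADS
  private final ConcurrentMap<Long, Future<?>> handles = new ConcurrentHashMap<>();
  private final ExecutorService executor = Executors.newFixedThreadPool(MAX_SLOTS);

  /** Eventually consistent, but never larger than the true number of open slots. */
  int openSlots() {
    return MAX_SLOTS - handles.size();
  }

  void enqueue(long id, Runnable work) {
    FutureTask<Void> task =
        new FutureTask<>(
            () -> {
              try {
                work.run();
              } finally {
                handles.remove(id); // de-queue only after the work has finished
              }
            },
            null);
    handles.put(id, task); // count the job before it can possibly run...
    executor.execute(task); // ...so a concurrent openSlots() never over-reports
  }
}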
/**
- * Gets the set of jobs that have registered schedules.
+ * Just logs the event and returns a unique id.
*
- * @return the {@link Set} of jobs registered via {@link #registerJob(PipelineJob)} that have
- * {@link PipelineJob#getSchedule()} values
- */
- @SuppressWarnings({"unchecked", "rawtypes"})
- public Set<PipelineJob<?>> getScheduledJobs() { ... }

 /**
+ * {@inheritDoc}
 *
- * @param <A> the type parameter
- * @param jobRecord the {@link PipelineJobRecord} of the job to run
- * @return {@inheritDoc}
 *
- * @param jobRecordId the {@link PipelineJobRecordId} of the job
- * @param exception the exception from the job failure
+ * @param summary summarizes the outcome of the run
*/
- private void handleJobFailure(PipelineJobRecordId jobRecordId, Exception exception) {
- synchronized (jobsEnqueuedHandles) {
- if (jobsEnqueuedHandles.containsKey(jobRecordId)) {
- jobRecordStore.recordJobFailure(jobRecordId, new PipelineJobFailure(exception));
- jobsEnqueuedHandles.remove(jobRecordId);
- }
- LOGGER.error("Job failure in Pipeline: " + exception.getMessage(), exception);
+ @Override
+ public synchronized void completedRun(PipelineJobRunner.JobRunSummary summary) {
+ log.info("job run complete: {}", summary);
+ if (completedJobs.size() > MAX_COMPLETED_JOBS) {
+ completedJobs.removeFirst();
}
+ completedJobs.addLast(summary);
}
/**
- * Handle job cancellation by de-queueing and recording cancellation.
+ * Just logs the event.
*
- * @param jobRecordId the {@link PipelineJobRecordId} of the job
+ * {@inheritDoc}
+ *
+ * @param job the job that is sleeping
*/
- private void handleJobCancellation(PipelineJobRecordId jobRecordId) {
- synchronized (jobsEnqueuedHandles) {
- if (jobsEnqueuedHandles.containsKey(jobRecordId)) {
- jobRecordStore.recordJobCancellation(jobRecordId);
- jobsEnqueuedHandles.remove(jobRecordId);
- }
- }
+ @Override
+ public void sleeping(PipelineJob job) {
+ log.debug("Job sleeping: type={}", job.getType());
}
/**
- * Handle normal job completion by de-queueing and recording completion.
+ * Just logs the event.
+ *
+ * {@inheritDoc}
*
- * @param jobRecordId the {@link PipelineJobRecordId} of the job
- * @param jobOutcome the outcome of the job to record
+ * @param job the job that is stopping
*/
- private void handleJobCompletion(PipelineJobRecordId jobRecordId, PipelineJobOutcome jobOutcome) {
- synchronized (jobsEnqueuedHandles) {
- if (jobsEnqueuedHandles.containsKey(jobRecordId)) {
- jobRecordStore.recordJobCompletion(jobRecordId, jobOutcome);
- jobsEnqueuedHandles.remove(jobRecordId);
- }
- }
+ @Override
+ public void stoppingDueToInterrupt(PipelineJob job) {
+ log.info("Job interrupted: type={}", job.getType());
}
/**
- * This will eventually end all jobs and shut down this {@link PipelineManager}. Note: not all
- * jobs support being stopped while in progress, so this method may block for quite a while.
+ * Saves the exception for reporting later. This will also prevent other jobs from running so that
+ * the pipeline can shut down gracefully without a call to {@link System#exit}.
+ *
+ * {@inheritDoc}
+ *
+ * @param job the job that is stopping
+ * @param exception the exception that was thrown
*/
- public void stop() {
- // If something has already shut us down, we're done.
- if (jobExecutor.isShutdown()) {
- return;
- }
-
- Timer.Context timerStop =
- appMetrics.timer(MetricRegistry.name(getClass().getSimpleName(), "stop")).time();
- LOGGER.info("Stopping PipelineManager...");
-
- /*
- * Tell the job executor to shut down, which will prevent it from accepting new jobs. If all
- * jobs are interruptible, we'll shut it down _harder_, such that in-progress job threads get
- * interrupted (a la Thread.interrupt()).
- */
- boolean unsafeToInterrupt = jobRegistry.values().stream().anyMatch(j -> !j.isInterruptible());
- if (unsafeToInterrupt) {
- jobExecutor.shutdown();
- LOGGER.info("Shut down job executor, without cancelling existing jobs.");
+ @Override
+ public synchronized void stoppingDueToException(PipelineJob job, Exception exception) {
+ log.error("Job execution failed: type={} exception={}", job.getType(), exception.getMessage());
+ if (this.error == null) {
+ this.error = exception;
} else {
- jobExecutor.shutdownNow();
- LOGGER.info("Shut down job executor, cancelling existing jobs.");
- }
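For reference, the two JDK shutdown modes chosen by the branch above differ in one more way than the comment notes: plain shutdown() still runs tasks that were already queued, while shutdownNow() discards the queue and interrupts running tasks. A sketch, where the bounded wait is an assumption for illustration rather than something the diff does:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

/** Sketch contrasting ExecutorService.shutdown() and shutdownNow(). */
final class ShutdownModes {
  static void stop(ExecutorService executor, boolean safeToInterrupt) throws InterruptedException {
    if (safeToInterrupt) {
      executor.shutdownNow(); // interrupt running tasks, discard queued ones
    } else {
      executor.shutdown(); // no new tasks, but queued/running tasks complete normally
    }
    executor.awaitTermination(1, TimeUnit.MINUTES); // bounded wait; an assumption
  }
}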
-
- LOGGER.info("Attempting to shut down interruptable jobs...");
-
- /*
- * Try to stop all jobs that are either not running yet or are interruptible. Note: VolunteerJob
- * might still be trying to submit jobs over on its thread, so we synchronize to keep things
- * consistent and ensure we don't miss any jobs.
- */
- synchronized (jobsEnqueuedHandles) {
- jobsEnqueuedHandles.values().parallelStream()
- .forEach(
- j -> {
- LOGGER.info(" Attempting to cancel " + j.job.getType());
- /*
- * Note: There's a race condition here, where the job may have completed just before
- * we try to cancel it, but that's okay because Future.cancel(...) is basically a
- * no-op for jobs that have already completed.
- */
- j.cancelIfInterruptible();
- });
- }
-
- LOGGER.info("Cancelled all interruptable jobs.");
-
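The race called out in the comment above is benign because cancelling an already-completed Future is a harmless no-op. A tiny self-contained demonstration:

import java.util.concurrent.CompletableFuture;

/** Sketch: Future.cancel(...) on a completed future returns false and changes nothing. */
final class CancelAfterCompletionDemo {
  public static void main(String[] args) {
    CompletableFuture<String> future = CompletableFuture.completedFuture("done");
    boolean cancelled = future.cancel(true); // returns false; the result is untouched
    System.out.println(cancelled + " / " + future.join()); // prints: false / done
  }
}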
- // Clean up any pending S3 operations and shut down the S3 manager (will be null if not a CCW
- // pipeline)
- if (s3TaskManagerHandle != null) {
- s3TaskManagerHandle.shutdownSafely();
+ this.error.addSuppressed(exception);
}
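stoppingDueToException(...) implements a first-error-wins policy: the first failure becomes the primary error, and later failures are attached via Throwable.addSuppressed so none are lost when the error is eventually reported. A self-contained sketch of the same policy:

/** Sketch of the first-error-wins accumulation used above. */
final class FirstErrorWinsDemo {
  public static void main(String[] args) {
    Exception error = null;
    for (int i = 1; i <= 3; i++) {
      Exception failure = new IllegalStateException("job failure #" + i);
      if (error == null) {
        error = failure; // first failure becomes the primary error
      } else {
        error.addSuppressed(failure); // later failures ride along as suppressed
      }
    }
    error.printStackTrace(); // one primary failure plus two suppressed entries
  }
}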
-
- /*
- * Wait for everything to halt.
- */
- boolean allStopped = jobExecutor.isTerminated();
- Optional<?> ...

 /**
+ * {@inheritDoc}
*
- * @param <A> the {@link PipelineJobArguments} type associated with this {@link PipelineJob}
- * implementation (see {@link NullPipelineJobArguments} for those {@link PipelineJob}
- * implementations which do not need arguments)
+ * @param job the job that is stopping
*/
- private static final class PipelineJobHandle<A extends PipelineJobArguments> {
- /** The {@link PipelineJob} that the paired {@link Future} is for. */
- private final PipelineJob<A> job;
- /** The {@link Future} representing an execution of the paired {@link PipelineJob}. */
- private final Future<PipelineJobOutcome> future;

-/**
- * Design Note: If we ever move to an autoscaled version of this application, it will be
- * important to ensure that this job is only running once across its environment, to avoid duplicate
- * job schedule triggers. To that end, this job will get run just like any other job (via the {@link
- * PipelineJobRecordStore} and the {@link VolunteerJob}). To ensure that it gets kicked off, the
- * {@link PipelineJobRecordStore} has a permanently uncompleted {@link PipelineJobRecord} for this:
- * the job will always be running and runs a scheduling loop internally.
- */
-public final class SchedulerJob implements PipelineJob<NullPipelineJobArguments> {

- /**
-  * Note: this "constant" is actually mutable, but should only ever be modified by tests.
-  */
- public static long SCHEDULER_TICK_MILLIS = 10 * 1000;
-
- /** The metrics for this job. */
- private final MetricRegistry appMetrics;
- /** The orchestration object for the pipeline. */
- private final PipelineManager pipelineManager;
- /** Holds the records of completed jobs. */
- private final PipelineJobRecordStore jobRecordsStore;
-
- /**
- * Constructs the {@link SchedulerJob}, which should be a singleton within the application
- * environment.
- *
- * @param appMetrics the {@link MetricRegistry} for the overall application
- * @param pipelineManager the {@link PipelineManager} that jobs should be run on
- * @param jobRecordsStore the {@link PipelineJobRecordStore} tracking jobs that have been
- * submitted for execution
- */
- public SchedulerJob(
- MetricRegistry appMetrics,
- PipelineManager pipelineManager,
- PipelineJobRecordStore jobRecordsStore) {
- this.appMetrics = appMetrics;
- this.pipelineManager = pipelineManager;
- this.jobRecordsStore = jobRecordsStore;
- }
-
- /** {@inheritDoc} */
- @Override
- public Optional<PipelineJobSchedule> getSchedule() { ... }

- /**
-  * Note: this "constant" is actually mutable, but should only ever be modified by tests.
-  */
- public static long VOLUNTEER_TICK_MILLIS = 10 * 1000;
-
- /** The metrics for this job. */
- private final MetricRegistry appMetrics;
- /** The orchestration object for the pipeline. */
- private final PipelineManager pipelineManager;
- /** Holds the records of completed jobs. */
- private final PipelineJobRecordStore jobRecordsStore;
-
- /**
- * Constructs the {@link VolunteerJob}, which should be a singleton within its JVM.
- *
- * @param appMetrics the {@link MetricRegistry} for the overall application
- * @param pipelineManager the {@link PipelineManager} that jobs should be run on
- * @param jobRecordsStore the {@link PipelineJobRecordStore} tracking jobs that have been
- * submitted for execution
- */
- public VolunteerJob(
- MetricRegistry appMetrics,
- PipelineManager pipelineManager,
- PipelineJobRecordStore jobRecordsStore) {
- this.appMetrics = appMetrics;
- this.pipelineManager = pipelineManager;
- this.jobRecordsStore = jobRecordsStore;
- }
-
- /** {@inheritDoc} */
- @Override
- public Optional<PipelineJobSchedule> getSchedule() { ... }

 /**
- * This is intentionally left ignored most of the time, so as to not slow down our builds. It
- * should only be run if/when someone is looking into performance issues.
+ * Verifies that {@link PipelineManager#stop()} works as expected.
*
- * @throws Exception Any unhandled {@link Exception}s will cause this test case to fail.
+ * @throws Exception passed through if the test throws
*/
@Test
- @Disabled
- public void runWayTooManyJobsThenStop() throws Exception {
- // Let's speed things up a bit, so we can run more iterations, faster.
- SchedulerJob.SCHEDULER_TICK_MILLIS = 1;
- VolunteerJob.VOLUNTEER_TICK_MILLIS = 1;
-
- MetricRegistry appMetrics = new MetricRegistry();
- Slf4jReporter.forRegistry(appMetrics).outputTo(LOGGER).build().start(30, TimeUnit.SECONDS);
-
- // Create the pipeline.
- PipelineJobRecordStore jobRecordStore = new PipelineJobRecordStore(appMetrics);
- try (PipelineManager pipelineManager =
- new PipelineManager(appMetrics, jobRecordStore, mockS3TaskManager)) {
- // Register a mock unscheduled job.
- MockJob mockUnscheduledJob =
- new MockJob(
- Optional.empty(),
- () -> {
- return PipelineJobOutcome.WORK_DONE;
- });
- pipelineManager.registerJob(mockUnscheduledJob);
-
- // Register a second scheduled job.
- MockJob mockScheduledJob =
- new MockJob(
- Optional.of(new PipelineJobSchedule(1, ChronoUnit.MILLIS)),
- () -> {
- return PipelineJobOutcome.WORK_DONE;
- }) {
- /*
- * Very hacky, but here we're extending MockJob with an anonymous class that has a
- * different getType() value.
- */
-
- /**
- * @see gov.cms.bfd.pipeline.app.PipelineManagerIT.MockJob#getType()
- */
- @Override
- public PipelineJobType<NullPipelineJobArguments> getType() { ... }

 /**
- * @return {@code true} if the specified job was enqueued, or {@code false} if it could
- *     not be (e.g. because {@link #stop()} has been called)
+ * @param job the job that is starting
+ * @return a unique id for the run
*/
- public <A extends PipelineJobArguments> boolean enqueueJob(PipelineJobRecord<A> jobRecord) {
- Timer.Context timerEnqueue =
- appMetrics.timer(MetricRegistry.name(getClass().getSimpleName(), "enqueue")).time();
-
- // First, find the specified job.
- @SuppressWarnings("unchecked")
- PipelineJob<A> job = (PipelineJob<A>) jobRegistry.get(jobRecord.getJobType());
- if (job == null)
- throw new IllegalArgumentException(
- String.format("Unknown or unregistered job type '%s'.", jobRecord.getJobType()));
-
- // Submit the job to be run!
- PipelineJobWrapper<A> jobWrapper;
- ListenableFuture<PipelineJobOutcome> ...

 /**
- * Checks if the RDA FISS load job has failed by checking the job records.
- *
- * @param appRunConsumer the {@link ProcessOutputConsumer} whose output should be checked
- * @return {@code true} if the application output indicates that the job has failed,
- *     {@code false} if not
- */
- private static boolean hasRdaFissLoadJobFailed(ProcessOutputConsumer appRunConsumer) {
- return hasJobRecordMatching(
- appRunConsumer,
- PipelineJobRecordStore.LOG_MESSAGE_PREFIX_JOB_FAILED,
- RdaFissClaimLoadJob.class);
- }
-
- /**
- * Checks if the RDA MCS load job has failed by checking the job records.
- *
- * @param appRunConsumer the {@link ProcessOutputConsumer} whose output should be checked
- * @return {@code true} if the application output indicates that the job has failed,
- *     {@code false} if not
- */
- private static boolean hasRdaMcsLoadJobFailed(ProcessOutputConsumer appRunConsumer) {
- return hasJobRecordMatching(
- appRunConsumer,
- PipelineJobRecordStore.LOG_MESSAGE_PREFIX_JOB_FAILED,
- RdaMcsClaimLoadJob.class);
+ appRunConsumer, PipelineJobRunner.JobRunSummary::isFailureString, CcwRifLoadJob.class);
}
/**
- * Checks if a job has a job record matching a specified value.
+ * Checks if a job has a job record matching a specified predicate.
*
* @param appRunConsumer the job to check
- * @param prefix the record prefix type to check for
- * @param klass the class of the job to check
- * @return {@code true} if the job had a record matching the specified prefix type
+ * @param matcher {@link Predicate} used to find a target string
+ * @param klass used to verify a target string contains the class name
+ * @return {@code true} if the job had a record matching the specified predicate and class name
*/
private static boolean hasJobRecordMatching(
- ProcessOutputConsumer appRunConsumer, String prefix, Class<?> klass) {
+ ProcessOutputConsumer appRunConsumer, Predicate<String> matcher, Class<?> klass) {
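The new predicate-based matcher generalizes the old prefix check. A sketch of how such a matcher can work, assuming the consumer's captured output is viewable as lines; the nested job class and the failure predicate below are hypothetical stand-ins, not the diff's real types:

import java.util.List;
import java.util.function.Predicate;

/** Sketch of predicate-plus-class-name matching over captured log output. */
final class LogMatchSketch {
  static final class CcwRifLoadJob {} // hypothetical stand-in for the real job class

  static boolean hasJobRecordMatching(
      List<String> capturedOutput, Predicate<String> matcher, Class<?> klass) {
    return capturedOutput.stream()
        .filter(matcher) // e.g. PipelineJobRunner.JobRunSummary::isFailureString in the diff
        .anyMatch(line -> line.contains(klass.getSimpleName()));
  }

  public static void main(String[] args) {
    List<String> output =
        List.of(
            "job run complete: FAILED CcwRifLoadJob id=1",
            "job run complete: SUCCESS OtherJob id=2");
    Predicate<String> isFailure = line -> line.contains("FAILED"); // stand-in predicate
    System.out.println(hasJobRecordMatching(output, isFailure, CcwRifLoadJob.class)); // true
  }
}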