Skip to content

Commit 3c0686e

Browse files
[cuebot] Fix exception handling in job completion metrics
- Remove try-catch from Prometheus metrics recording to allow programming errors (wrong labels) to fail loudly
- Keep Kafka event publishing exception handling but properly log with stack trace for debugging transient failures
1 parent ba11dea commit 3c0686e

1 file changed

Lines changed: 10 additions & 14 deletions

File tree

cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java

Lines changed: 10 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -164,19 +164,15 @@ public boolean shutdownJob(JobInterface job, Source source, boolean isManualKill
164164

165165
// Record job completion metric
166166
if (prometheusMetrics != null && showDao != null) {
167-
try {
168-
JobDetail jobDetail = jobManager.getJobDetail(job.getId());
169-
String showName = showDao.getShowDetail(job.getShowId()).getName();
170-
String state = isManualKill ? "KILLED" : "FINISHED";
171-
prometheusMetrics.recordJobCompleted(state, showName, jobDetail.shot);
172-
173-
// Record job core seconds histogram
174-
ExecutionSummary execSummary = jobManager.getExecutionSummary(job);
175-
prometheusMetrics.recordJobCoreSeconds(execSummary.coreTime, showName,
176-
jobDetail.shot);
177-
} catch (Exception e) {
178-
logger.warn("Failed to record job completion metric: " + e.getMessage());
179-
}
167+
JobDetail jobDetail = jobManager.getJobDetail(job.getId());
168+
String showName = showDao.getShowDetail(job.getShowId()).getName();
169+
String state = isManualKill ? "KILLED" : "FINISHED";
170+
prometheusMetrics.recordJobCompleted(state, showName, jobDetail.shot);
171+
172+
// Record job core seconds histogram
173+
ExecutionSummary execSummary = jobManager.getExecutionSummary(job);
174+
prometheusMetrics.recordJobCoreSeconds(execSummary.coreTime, showName,
175+
jobDetail.shot);
180176
}
181177

182178
// Publish job completed/killed event to Kafka
@@ -190,7 +186,7 @@ public boolean shutdownJob(JobInterface job, Source source, boolean isManualKill
190186
jobDetail, previousState, null, null);
191187
kafkaEventPublisher.publishJobEvent(jobEvent);
192188
} catch (Exception e) {
193-
logger.warn("Failed to publish job event: " + e.getMessage());
189+
logger.warn("Failed to publish job event", e);
194190
}
195191
}
196192

0 commit comments

Comments
 (0)