diff --git a/CHANGES.md b/CHANGES.md index feff8604fd6029099d1be756ffcbb94f7a583432..ea1334c5eb6e15d4ef82379fb749946c32077cf9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -51,6 +51,7 @@ Release Notes. * Fix: `!=` is not supported in oal when parameters are numbers. * Include events of the entity(s) in the alarm. * Support `native-json` format log in kafka-fetcher-plugin. +* Fix counter misuse in the alarm core. Alarm can't be triggered in time. #### UI * Add logo for kong plugin. diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index 5ebd951e213e2cbda94a22aafe3d533e4621ce33..5d7e148340ba0adc3796b42520f45d2206d3e7ee 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantLock; import java.util.regex.Pattern; import java.util.stream.Collectors; - import lombok.RequiredArgsConstructor; import lombok.ToString; import lombok.extern.slf4j.Slf4j; @@ -107,7 +106,11 @@ public class RunningRule { Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null; this.formatter = new AlarmMessageFormatter(alarmRule.getMessage()); this.onlyAsCondition = alarmRule.isOnlyAsCondition(); - this.tags = alarmRule.getTags().entrySet().stream().map(e -> new Tag(e.getKey(), e.getValue())).collect(Collectors.toList()); + this.tags = alarmRule.getTags() + .entrySet() + .stream() + .map(e -> new Tag(e.getKey(), e.getValue())) + .collect(Collectors.toList()); } /** @@ -146,12 +149,13 @@ public class RunningRule { threshold.setType(MetricsValueType.MULTI_INTS); } else if (metrics instanceof LabeledValueHolder) { if (((LabeledValueHolder) metrics).getValue().keys().stream() - .noneMatch(label -> validate( - label, - includeLabels, - excludeLabels, - includeLabelsRegex, - excludeLabelsRegex))) { + .noneMatch(label -> validate( + label, + includeLabels, + excludeLabels, + includeLabelsRegex, + excludeLabelsRegex + ))) { return; } valueType = MetricsValueType.LABELED_LONG; @@ -169,11 +173,11 @@ public class RunningRule { } /** - * Validate target whether matching rules which is included list, excludes list, include regular expression - * or exclude regular expression. + * Validate target whether matching rules which is included list, excludes list, include regular expression or + * exclude regular expression. */ private boolean validate(String target, List includeList, List excludeList, - Pattern includeRegex, Pattern excludeRegex) { + Pattern includeRegex, Pattern excludeRegex) { if (CollectionUtils.isNotEmpty(includeList)) { if (!includeList.contains(target)) { if (log.isTraceEnabled()) { @@ -256,7 +260,6 @@ public class RunningRule { public class Window { private LocalDateTime endTime; private int period; - private int counter; private int silenceCountdown; private LinkedList values; @@ -266,7 +269,6 @@ public class RunningRule { this.period = period; // -1 means silence countdown is not running. silenceCountdown = -1; - counter = 0; init(); } @@ -320,7 +322,10 @@ public class RunningRule { // too old data // also should happen, but maybe if agent/probe mechanism time is not right. if (log.isTraceEnabled()) { - log.trace("Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, values.size()); + log.trace( + "Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, + values.size() + ); } return; } @@ -338,12 +343,10 @@ public class RunningRule { if (isMatch()) { /* * When - * 1. Metrics value threshold triggers alarm by rule - * 2. Counter reaches the count threshold; - * 3. Isn't in silence stage, judged by SilenceCountdown(!=0). + * 1. Alarm trigger conditions are satisfied. + * 2. Isn't in silence stage, judged by SilenceCountdown(!=0). */ - counter++; - if (counter >= countThreshold && silenceCountdown < 1) { + if (silenceCountdown < 1) { silenceCountdown = silencePeriod; return Optional.of(new AlarmMessage()); } else { @@ -351,9 +354,6 @@ public class RunningRule { } } else { silenceCountdown--; - if (counter > 0) { - counter--; - } } return Optional.empty(); } @@ -415,13 +415,14 @@ public class RunningRule { DataTable values = ((LabeledValueHolder) metrics).getValue(); lexpected = RunningRule.this.threshold.getLongThreshold(); if (values.keys().stream().anyMatch(label -> - validate( - label, - RunningRule.this.includeLabels, - RunningRule.this.excludeLabels, - RunningRule.this.includeLabelsRegex, - RunningRule.this.excludeLabelsRegex) - && op.test(lexpected, values.get(label)))) { + validate( + label, + RunningRule.this.includeLabels, + RunningRule.this.excludeLabels, + RunningRule.this.includeLabelsRegex, + RunningRule.this.excludeLabelsRegex + ) + && op.test(lexpected, values.get(label)))) { matchCount++; } break; @@ -466,7 +467,9 @@ public class RunningRule { break; case LABELED_LONG: DataTable dt = ((LabeledValueHolder) m).getValue(); - TraceLogMetric l = new TraceLogMetric(m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()).toArray(new Number[0])); + TraceLogMetric l = new TraceLogMetric( + m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()) + .toArray(new Number[0])); l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]); r.add(l); } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java index a7229ec9a6740f0fe4fe20dc779babfad71ed392..a025aebec2345ab91aea78e12db32e5f3a9ed7ca 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java @@ -19,11 +19,11 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import com.google.common.collect.Lists; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.HashMap; import lombok.Getter; import lombok.Setter; import org.apache.skywalking.oap.server.core.Const; @@ -103,21 +103,16 @@ public class RunningRuleTest { runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); - runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); // check at 201808301440 List alarmMessages = runningRule.check(); Assert.assertEquals(0, alarmMessages.size()); - runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); - // check at 201808301441 - alarmMessages = runningRule.check(); - Assert.assertEquals(0, alarmMessages.size()); - runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442")); - // check at 201808301442 + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); + + // check at 201808301440 alarmMessages = runningRule.check(); Assert.assertEquals(1, alarmMessages.size()); - Assert.assertEquals("Successful rate of endpoint Service_123 is lower than 75%", alarmMessages.get(0) - .getAlarmMessage()); } @Test @@ -142,22 +137,18 @@ public class RunningRuleTest { runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod1, 70, 60, 40, 40, 40)); runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod2, 60, 60, 40, 40, 40)); - runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40)); // check at 201808301440 List alarmMessages = runningRule.check(); Assert.assertEquals(0, alarmMessages.size()); runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); - // check at 201808301441 - alarmMessages = runningRule.check(); - Assert.assertEquals(0, alarmMessages.size()); - runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442")); - // check at 201808301442 + + runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40)); + + // check at 201808301440 alarmMessages = runningRule.check(); Assert.assertEquals(1, alarmMessages.size()); - Assert.assertEquals( - "response percentile of endpoint Service_123 is lower than expected values", alarmMessages.get(0) - .getAlarmMessage()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); } @Test @@ -243,16 +234,18 @@ public class RunningRuleTest { long timeInPeriod3 = 201808301438L; runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); - runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); // check at 201808301440 Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); - // check at 201808301441 - Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm - runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442")); + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74)); + + // check at 201808301440 + Assert.assertEquals(1, runningRule.check().size()); //alarm + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); + // check at 201808301442 - Assert.assertNotEquals(0, runningRule.check().size()); //alarm Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm Assert.assertNotEquals(0, runningRule.check().size()); //alarm @@ -304,7 +297,8 @@ public class RunningRuleTest { alarmRule.setThreshold("1000"); alarmRule.setCount(1); alarmRule.setPeriod(10); - alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + alarmRule.setMessage( + "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+"); alarmRule.setTags(new HashMap() {{ put("key", "value"); @@ -338,7 +332,8 @@ public class RunningRuleTest { alarmRule.setThreshold("1000"); alarmRule.setCount(1); alarmRule.setPeriod(10); - alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + alarmRule.setMessage( + "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+"); alarmRule.setTags(new HashMap() {{ put("key", "value"); @@ -603,22 +598,16 @@ public class RunningRuleTest { runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11")); runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12")); - runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20")); - // check at 201808301440 List alarmMessages = runningRule.check(); Assert.assertEquals(0, alarmMessages.size()); runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); - // check at 201808301441 - alarmMessages = runningRule.check(); - Assert.assertEquals(0, alarmMessages.size()); - runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442")); - // check at 201808301442 + + runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20")); + + // check at 201808301440 alarmMessages = runningRule.check(); Assert.assertEquals(1, alarmMessages.size()); - Assert.assertEquals( - "response percentile of endpoint Service_123 is lower than expected value", alarmMessages.get(0) - .getAlarmMessage()); - + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); } }