未验证 提交 3d188e60 编写于 作者: wu-sheng's avatar wu-sheng 提交者: GitHub

Fix counter misuse in the alarm core. Alarm can't be triggered in time. (#7005)

* Fix counter misuse in the alarm core

* Submit changelog
上级 32718d1d
...@@ -51,6 +51,7 @@ Release Notes. ...@@ -51,6 +51,7 @@ Release Notes.
* Fix: `!=` is not supported in oal when parameters are numbers. * Fix: `!=` is not supported in oal when parameters are numbers.
* Include events of the entity(s) in the alarm. * Include events of the entity(s) in the alarm.
* Support `native-json` format log in kafka-fetcher-plugin. * Support `native-json` format log in kafka-fetcher-plugin.
* Fix counter misuse in the alarm core. Alarm can't be triggered in time.
#### UI #### UI
* Add logo for kong plugin. * Add logo for kong plugin.
......
...@@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap; ...@@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.ToString; import lombok.ToString;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
...@@ -107,7 +106,11 @@ public class RunningRule { ...@@ -107,7 +106,11 @@ public class RunningRule {
Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null; Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null;
this.formatter = new AlarmMessageFormatter(alarmRule.getMessage()); this.formatter = new AlarmMessageFormatter(alarmRule.getMessage());
this.onlyAsCondition = alarmRule.isOnlyAsCondition(); this.onlyAsCondition = alarmRule.isOnlyAsCondition();
this.tags = alarmRule.getTags().entrySet().stream().map(e -> new Tag(e.getKey(), e.getValue())).collect(Collectors.toList()); this.tags = alarmRule.getTags()
.entrySet()
.stream()
.map(e -> new Tag(e.getKey(), e.getValue()))
.collect(Collectors.toList());
} }
/** /**
...@@ -146,12 +149,13 @@ public class RunningRule { ...@@ -146,12 +149,13 @@ public class RunningRule {
threshold.setType(MetricsValueType.MULTI_INTS); threshold.setType(MetricsValueType.MULTI_INTS);
} else if (metrics instanceof LabeledValueHolder) { } else if (metrics instanceof LabeledValueHolder) {
if (((LabeledValueHolder) metrics).getValue().keys().stream() if (((LabeledValueHolder) metrics).getValue().keys().stream()
.noneMatch(label -> validate( .noneMatch(label -> validate(
label, label,
includeLabels, includeLabels,
excludeLabels, excludeLabels,
includeLabelsRegex, includeLabelsRegex,
excludeLabelsRegex))) { excludeLabelsRegex
))) {
return; return;
} }
valueType = MetricsValueType.LABELED_LONG; valueType = MetricsValueType.LABELED_LONG;
...@@ -169,11 +173,11 @@ public class RunningRule { ...@@ -169,11 +173,11 @@ public class RunningRule {
} }
/** /**
* Validate target whether matching rules which is included list, excludes list, include regular expression * Validate target whether matching rules which is included list, excludes list, include regular expression or
* or exclude regular expression. * exclude regular expression.
*/ */
private boolean validate(String target, List<String> includeList, List<String> excludeList, private boolean validate(String target, List<String> includeList, List<String> excludeList,
Pattern includeRegex, Pattern excludeRegex) { Pattern includeRegex, Pattern excludeRegex) {
if (CollectionUtils.isNotEmpty(includeList)) { if (CollectionUtils.isNotEmpty(includeList)) {
if (!includeList.contains(target)) { if (!includeList.contains(target)) {
if (log.isTraceEnabled()) { if (log.isTraceEnabled()) {
...@@ -256,7 +260,6 @@ public class RunningRule { ...@@ -256,7 +260,6 @@ public class RunningRule {
public class Window { public class Window {
private LocalDateTime endTime; private LocalDateTime endTime;
private int period; private int period;
private int counter;
private int silenceCountdown; private int silenceCountdown;
private LinkedList<Metrics> values; private LinkedList<Metrics> values;
...@@ -266,7 +269,6 @@ public class RunningRule { ...@@ -266,7 +269,6 @@ public class RunningRule {
this.period = period; this.period = period;
// -1 means silence countdown is not running. // -1 means silence countdown is not running.
silenceCountdown = -1; silenceCountdown = -1;
counter = 0;
init(); init();
} }
...@@ -320,7 +322,10 @@ public class RunningRule { ...@@ -320,7 +322,10 @@ public class RunningRule {
// too old data // too old data
// also should happen, but maybe if agent/probe mechanism time is not right. // also should happen, but maybe if agent/probe mechanism time is not right.
if (log.isTraceEnabled()) { if (log.isTraceEnabled()) {
log.trace("Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, values.size()); log.trace(
"Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime,
values.size()
);
} }
return; return;
} }
...@@ -338,12 +343,10 @@ public class RunningRule { ...@@ -338,12 +343,10 @@ public class RunningRule {
if (isMatch()) { if (isMatch()) {
/* /*
* When * When
* 1. Metrics value threshold triggers alarm by rule * 1. Alarm trigger conditions are satisfied.
* 2. Counter reaches the count threshold; * 2. Isn't in silence stage, judged by SilenceCountdown(!=0).
* 3. Isn't in silence stage, judged by SilenceCountdown(!=0).
*/ */
counter++; if (silenceCountdown < 1) {
if (counter >= countThreshold && silenceCountdown < 1) {
silenceCountdown = silencePeriod; silenceCountdown = silencePeriod;
return Optional.of(new AlarmMessage()); return Optional.of(new AlarmMessage());
} else { } else {
...@@ -351,9 +354,6 @@ public class RunningRule { ...@@ -351,9 +354,6 @@ public class RunningRule {
} }
} else { } else {
silenceCountdown--; silenceCountdown--;
if (counter > 0) {
counter--;
}
} }
return Optional.empty(); return Optional.empty();
} }
...@@ -415,13 +415,14 @@ public class RunningRule { ...@@ -415,13 +415,14 @@ public class RunningRule {
DataTable values = ((LabeledValueHolder) metrics).getValue(); DataTable values = ((LabeledValueHolder) metrics).getValue();
lexpected = RunningRule.this.threshold.getLongThreshold(); lexpected = RunningRule.this.threshold.getLongThreshold();
if (values.keys().stream().anyMatch(label -> if (values.keys().stream().anyMatch(label ->
validate( validate(
label, label,
RunningRule.this.includeLabels, RunningRule.this.includeLabels,
RunningRule.this.excludeLabels, RunningRule.this.excludeLabels,
RunningRule.this.includeLabelsRegex, RunningRule.this.includeLabelsRegex,
RunningRule.this.excludeLabelsRegex) RunningRule.this.excludeLabelsRegex
&& op.test(lexpected, values.get(label)))) { )
&& op.test(lexpected, values.get(label)))) {
matchCount++; matchCount++;
} }
break; break;
...@@ -466,7 +467,9 @@ public class RunningRule { ...@@ -466,7 +467,9 @@ public class RunningRule {
break; break;
case LABELED_LONG: case LABELED_LONG:
DataTable dt = ((LabeledValueHolder) m).getValue(); DataTable dt = ((LabeledValueHolder) m).getValue();
TraceLogMetric l = new TraceLogMetric(m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()).toArray(new Number[0])); TraceLogMetric l = new TraceLogMetric(
m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder())
.toArray(new Number[0]));
l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]); l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]);
r.add(l); r.add(l);
} }
......
...@@ -19,11 +19,11 @@ ...@@ -19,11 +19,11 @@
package org.apache.skywalking.oap.server.core.alarm.provider; package org.apache.skywalking.oap.server.core.alarm.provider;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import java.util.HashMap;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.HashMap;
import lombok.Getter; import lombok.Getter;
import lombok.Setter; import lombok.Setter;
import org.apache.skywalking.oap.server.core.Const; import org.apache.skywalking.oap.server.core.Const;
...@@ -103,21 +103,16 @@ public class RunningRuleTest { ...@@ -103,21 +103,16 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
// check at 201808301440 // check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check(); List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size()); Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441 runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size()); // check at 201808301440
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
// check at 201808301442
alarmMessages = runningRule.check(); alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size()); Assert.assertEquals(1, alarmMessages.size());
Assert.assertEquals("Successful rate of endpoint Service_123 is lower than 75%", alarmMessages.get(0)
.getAlarmMessage());
} }
@Test @Test
...@@ -142,22 +137,18 @@ public class RunningRuleTest { ...@@ -142,22 +137,18 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod1, 70, 60, 40, 40, 40)); runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod1, 70, 60, 40, 40, 40));
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod2, 60, 60, 40, 40, 40)); runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod2, 60, 60, 40, 40, 40));
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
// check at 201808301440 // check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check(); List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size()); Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
alarmMessages = runningRule.check(); runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442")); // check at 201808301440
// check at 201808301442
alarmMessages = runningRule.check(); alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size()); Assert.assertEquals(1, alarmMessages.size());
Assert.assertEquals( runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
"response percentile of endpoint Service_123 is lower than expected values", alarmMessages.get(0)
.getAlarmMessage());
} }
@Test @Test
...@@ -243,16 +234,18 @@ public class RunningRuleTest { ...@@ -243,16 +234,18 @@ public class RunningRuleTest {
long timeInPeriod3 = 201808301438L; long timeInPeriod3 = 201808301438L;
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71)); runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
// check at 201808301440 // check at 201808301440
Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
// check at 201808301440
Assert.assertEquals(1, runningRule.check().size()); //alarm
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301442 // check at 201808301442
Assert.assertNotEquals(0, runningRule.check().size()); //alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertNotEquals(0, runningRule.check().size()); //alarm Assert.assertNotEquals(0, runningRule.check().size()); //alarm
...@@ -304,7 +297,8 @@ public class RunningRuleTest { ...@@ -304,7 +297,8 @@ public class RunningRuleTest {
alarmRule.setThreshold("1000"); alarmRule.setThreshold("1000");
alarmRule.setCount(1); alarmRule.setCount(1);
alarmRule.setPeriod(10); alarmRule.setPeriod(10);
alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); alarmRule.setMessage(
"Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+"); alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+");
alarmRule.setTags(new HashMap<String, String>() {{ alarmRule.setTags(new HashMap<String, String>() {{
put("key", "value"); put("key", "value");
...@@ -338,7 +332,8 @@ public class RunningRuleTest { ...@@ -338,7 +332,8 @@ public class RunningRuleTest {
alarmRule.setThreshold("1000"); alarmRule.setThreshold("1000");
alarmRule.setCount(1); alarmRule.setCount(1);
alarmRule.setPeriod(10); alarmRule.setPeriod(10);
alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); alarmRule.setMessage(
"Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+"); alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+");
alarmRule.setTags(new HashMap<String, String>() {{ alarmRule.setTags(new HashMap<String, String>() {{
put("key", "value"); put("key", "value");
...@@ -603,22 +598,16 @@ public class RunningRuleTest { ...@@ -603,22 +598,16 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11")); runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11"));
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12")); runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12"));
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
// check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check(); List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size()); Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
alarmMessages = runningRule.check(); runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442")); // check at 201808301440
// check at 201808301442
alarmMessages = runningRule.check(); alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size()); Assert.assertEquals(1, alarmMessages.size());
Assert.assertEquals( runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
"response percentile of endpoint Service_123 is lower than expected value", alarmMessages.get(0)
.getAlarmMessage());
} }
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册