未验证 提交 3d188e60 编写于 作者: wu-sheng's avatar wu-sheng 提交者: GitHub

Fix counter misuse in the alarm core. Alarm can't be triggered in time. (#7005)

* Fix counter misuse in the alarm core

* Submit changelog
上级 32718d1d
......@@ -51,6 +51,7 @@ Release Notes.
* Fix: `!=` is not supported in oal when parameters are numbers.
* Include events of the entity(s) in the alarm.
* Support `native-json` format log in kafka-fetcher-plugin.
* Fix counter misuse in the alarm core. Alarm can't be triggered in time.
#### UI
* Add logo for kong plugin.
......
......@@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
......@@ -107,7 +106,11 @@ public class RunningRule {
Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null;
this.formatter = new AlarmMessageFormatter(alarmRule.getMessage());
this.onlyAsCondition = alarmRule.isOnlyAsCondition();
this.tags = alarmRule.getTags().entrySet().stream().map(e -> new Tag(e.getKey(), e.getValue())).collect(Collectors.toList());
this.tags = alarmRule.getTags()
.entrySet()
.stream()
.map(e -> new Tag(e.getKey(), e.getValue()))
.collect(Collectors.toList());
}
/**
......@@ -146,12 +149,13 @@ public class RunningRule {
threshold.setType(MetricsValueType.MULTI_INTS);
} else if (metrics instanceof LabeledValueHolder) {
if (((LabeledValueHolder) metrics).getValue().keys().stream()
.noneMatch(label -> validate(
label,
includeLabels,
excludeLabels,
includeLabelsRegex,
excludeLabelsRegex))) {
.noneMatch(label -> validate(
label,
includeLabels,
excludeLabels,
includeLabelsRegex,
excludeLabelsRegex
))) {
return;
}
valueType = MetricsValueType.LABELED_LONG;
......@@ -169,11 +173,11 @@ public class RunningRule {
}
/**
* Validate target whether matching rules which is included list, excludes list, include regular expression
* or exclude regular expression.
* Validate target whether matching rules which is included list, excludes list, include regular expression or
* exclude regular expression.
*/
private boolean validate(String target, List<String> includeList, List<String> excludeList,
Pattern includeRegex, Pattern excludeRegex) {
Pattern includeRegex, Pattern excludeRegex) {
if (CollectionUtils.isNotEmpty(includeList)) {
if (!includeList.contains(target)) {
if (log.isTraceEnabled()) {
......@@ -256,7 +260,6 @@ public class RunningRule {
public class Window {
private LocalDateTime endTime;
private int period;
private int counter;
private int silenceCountdown;
private LinkedList<Metrics> values;
......@@ -266,7 +269,6 @@ public class RunningRule {
this.period = period;
// -1 means silence countdown is not running.
silenceCountdown = -1;
counter = 0;
init();
}
......@@ -320,7 +322,10 @@ public class RunningRule {
// too old data
// also should happen, but maybe if agent/probe mechanism time is not right.
if (log.isTraceEnabled()) {
log.trace("Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, values.size());
log.trace(
"Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime,
values.size()
);
}
return;
}
......@@ -338,12 +343,10 @@ public class RunningRule {
if (isMatch()) {
/*
* When
* 1. Metrics value threshold triggers alarm by rule
* 2. Counter reaches the count threshold;
* 3. Isn't in silence stage, judged by SilenceCountdown(!=0).
* 1. Alarm trigger conditions are satisfied.
* 2. Isn't in silence stage, judged by SilenceCountdown(!=0).
*/
counter++;
if (counter >= countThreshold && silenceCountdown < 1) {
if (silenceCountdown < 1) {
silenceCountdown = silencePeriod;
return Optional.of(new AlarmMessage());
} else {
......@@ -351,9 +354,6 @@ public class RunningRule {
}
} else {
silenceCountdown--;
if (counter > 0) {
counter--;
}
}
return Optional.empty();
}
......@@ -415,13 +415,14 @@ public class RunningRule {
DataTable values = ((LabeledValueHolder) metrics).getValue();
lexpected = RunningRule.this.threshold.getLongThreshold();
if (values.keys().stream().anyMatch(label ->
validate(
label,
RunningRule.this.includeLabels,
RunningRule.this.excludeLabels,
RunningRule.this.includeLabelsRegex,
RunningRule.this.excludeLabelsRegex)
&& op.test(lexpected, values.get(label)))) {
validate(
label,
RunningRule.this.includeLabels,
RunningRule.this.excludeLabels,
RunningRule.this.includeLabelsRegex,
RunningRule.this.excludeLabelsRegex
)
&& op.test(lexpected, values.get(label)))) {
matchCount++;
}
break;
......@@ -466,7 +467,9 @@ public class RunningRule {
break;
case LABELED_LONG:
DataTable dt = ((LabeledValueHolder) m).getValue();
TraceLogMetric l = new TraceLogMetric(m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()).toArray(new Number[0]));
TraceLogMetric l = new TraceLogMetric(
m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder())
.toArray(new Number[0]));
l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]);
r.add(l);
}
......
......@@ -19,11 +19,11 @@
package org.apache.skywalking.oap.server.core.alarm.provider;
import com.google.common.collect.Lists;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.HashMap;
import lombok.Getter;
import lombok.Setter;
import org.apache.skywalking.oap.server.core.Const;
......@@ -103,21 +103,16 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
// check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
// check at 201808301442
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
// check at 201808301440
alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size());
Assert.assertEquals("Successful rate of endpoint Service_123 is lower than 75%", alarmMessages.get(0)
.getAlarmMessage());
}
@Test
......@@ -142,22 +137,18 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod1, 70, 60, 40, 40, 40));
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod2, 60, 60, 40, 40, 40));
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
// check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
// check at 201808301442
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
// check at 201808301440
alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size());
Assert.assertEquals(
"response percentile of endpoint Service_123 is lower than expected values", alarmMessages.get(0)
.getAlarmMessage());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
}
@Test
......@@ -243,16 +234,18 @@ public class RunningRuleTest {
long timeInPeriod3 = 201808301438L;
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
// check at 201808301440
Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
// check at 201808301440
Assert.assertEquals(1, runningRule.check().size()); //alarm
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301442
Assert.assertNotEquals(0, runningRule.check().size()); //alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertNotEquals(0, runningRule.check().size()); //alarm
......@@ -304,7 +297,8 @@ public class RunningRuleTest {
alarmRule.setThreshold("1000");
alarmRule.setCount(1);
alarmRule.setPeriod(10);
alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
alarmRule.setMessage(
"Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+");
alarmRule.setTags(new HashMap<String, String>() {{
put("key", "value");
......@@ -338,7 +332,8 @@ public class RunningRuleTest {
alarmRule.setThreshold("1000");
alarmRule.setCount(1);
alarmRule.setPeriod(10);
alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
alarmRule.setMessage(
"Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+");
alarmRule.setTags(new HashMap<String, String>() {{
put("key", "value");
......@@ -603,22 +598,16 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11"));
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12"));
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
// check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
// check at 201808301442
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
// check at 201808301440
alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size());
Assert.assertEquals(
"response percentile of endpoint Service_123 is lower than expected value", alarmMessages.get(0)
.getAlarmMessage());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册