From 68b918e3cd13463ebd6001689f3ce770475c057d Mon Sep 17 00:00:00 2001 From: Zhenxu Date: Thu, 27 May 2021 18:04:21 +0800 Subject: [PATCH] Events can be configured as alarm source (#7028) --- CHANGES.md | 1 + docs/en/concepts-and-designs/event.md | 48 ++++++++++++++++++- .../src/main/resources/alarm-settings.yml | 13 +++++ .../oap/server/core/event/Event.java | 27 ++++++++++- 4 files changed, 87 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index e22c381946..ea93875372 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -54,6 +54,7 @@ Release Notes. * Include events of the entity(s) in the alarm. * Support `native-json` format log in kafka-fetcher-plugin. * Fix counter misuse in the alarm core. Alarm can't be triggered in time. +* Events can be configured as alarm source. #### UI * Add logo for kong plugin. diff --git a/docs/en/concepts-and-designs/event.md b/docs/en/concepts-and-designs/event.md index e73d90b1ae..940a64109d 100644 --- a/docs/en/concepts-and-designs/event.md +++ b/docs/en/concepts-and-designs/event.md @@ -55,10 +55,56 @@ The end time of the event. This field may be empty if the event has not ended ye **NOTE:** When reporting an event, you typically call the report function twice, the first time for starting of the event and the second time for ending of the event, both with the same UUID. There are also cases where you would already have both the start time and end time. For example, when exporting events from a third-party system, the start time and end time are already known so you may simply call the report function once. +## How to Configure Alarms for Events + +Events are derived from metrics, and can be the source to trigger alarms. For example, if a specific event occurs a +certain number of times within a period, alarms can be triggered and sent. + +Every event has a default `value = 1`; when `n` events with the same name are reported, they are aggregated +into `value = n` as follows.
+ +``` +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +``` + +will be aggregated into + +``` +Event{name=Unhealthy, source={service=A,instance=a}, ...} <value = 6> +``` + +so you can configure the following alarm rule to trigger an alarm when the `Unhealthy` event occurs more than 5 times within 10 +minutes. + +```yaml +rules: + unhealthy_event_rule: + metrics-name: Unhealthy + # Healthiness check is usually a scheduled task, + # they may be unhealthy for the first few times, + # and can be unhealthy occasionally due to network jitter, + # please adjust the threshold as per your actual situation. + threshold: 5 + op: ">" + period: 10 + count: 1 + message: Service instance has been unhealthy for 10 minutes +``` + +For more alarm configuration details, please refer to the [alarm doc](../setup/backend/backend-alarm.md). + +**Note** that the `Unhealthy` event above is only for demonstration; it is not detected by default in SkyWalking. +However, you can use the methods in [How to Report Events](#how-to-report-events) to report this kind of event. + ## Known Events | Name | Type | When | | :----: | :----: | :-----| | Start | Normal | When your Java Application starts with SkyWalking Agent installed, the `Start` Event will be created. | | Shutdown | Normal | When your Java Application stops with SkyWalking Agent installed, the `Shutdown` Event will be created. | -| Alarm | Error | When the Alarm is triggered, the corresponding `Alarm` Event will is created. | \ No newline at end of file +| Alarm | Error | When the Alarm is triggered, the corresponding `Alarm` Event will be created. 
| diff --git a/oap-server/server-bootstrap/src/main/resources/alarm-settings.yml b/oap-server/server-bootstrap/src/main/resources/alarm-settings.yml index 0efbe267ce..a255ef414c 100755 --- a/oap-server/server-bootstrap/src/main/resources/alarm-settings.yml +++ b/oap-server/server-bootstrap/src/main/resources/alarm-settings.yml @@ -40,6 +40,19 @@ rules: count: 1 tags: level: WARNING +# unhealthy_event_rule: +# metrics-name: Unhealthy + # Healthiness check is usually a scheduled task, + # they may be unhealthy for the first few times, + # and can be unhealthy occasionally due to network jitter, + # please adjust the threshold as per your actual situation. +# threshold: 5 +# op: ">" +# period: 10 +# count: 1 +# message: Service instance has been unhealthy for 10 minutes +# tags: +# level: ERROR webhooks: # - http://127.0.0.1/notify/ diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/event/Event.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/event/Event.java index 7065589511..37269be5ba 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/event/Event.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/event/Event.java @@ -24,15 +24,21 @@ import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.Setter; import org.apache.skywalking.apm.util.StringUtil; +import org.apache.skywalking.oap.server.core.analysis.IDManager; import org.apache.skywalking.oap.server.core.analysis.MetricsExtension; import org.apache.skywalking.oap.server.core.analysis.Stream; import org.apache.skywalking.oap.server.core.analysis.TimeBucket; +import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics; +import org.apache.skywalking.oap.server.core.analysis.metrics.MetricsMetaInfo; +import org.apache.skywalking.oap.server.core.analysis.metrics.WithMetadata; 
import org.apache.skywalking.oap.server.core.analysis.worker.MetricsStreamProcessor; import org.apache.skywalking.oap.server.core.remote.grpc.proto.RemoteData; +import org.apache.skywalking.oap.server.core.source.DefaultScopeDefine; import org.apache.skywalking.oap.server.core.source.ScopeDeclaration; import org.apache.skywalking.oap.server.core.storage.StorageHashMapBuilder; import org.apache.skywalking.oap.server.core.storage.annotation.Column; +import org.elasticsearch.common.Strings; import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.EVENT; @@ -45,7 +51,7 @@ import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.EV of = "uuid" ) @MetricsExtension(supportDownSampling = false, supportUpdate = true) -public class Event extends Metrics { +public class Event extends Metrics implements WithMetadata, LongValueHolder { public static final String INDEX_NAME = "events"; @@ -104,10 +110,14 @@ public class Event extends Metrics { @Column(columnName = END_TIME) private long endTime; + private transient long value = 1; + @Override public boolean combine(final Metrics metrics) { final Event event = (Event) metrics; + value++; + // Set time bucket only when it's never set. 
if (getTimeBucket() <= 0) { if (event.getStartTime() > 0) { @@ -193,6 +203,21 @@ public class Event extends Metrics { return hashCode(); } + @Override + public MetricsMetaInfo getMeta() { + int scope = DefaultScopeDefine.SERVICE; + final String serviceId = IDManager.ServiceID.buildId(getService(), true); + String id = serviceId; + if (!Strings.isNullOrEmpty(getServiceInstance())) { + scope = DefaultScopeDefine.SERVICE_INSTANCE; + id = IDManager.ServiceInstanceID.buildId(serviceId, getServiceInstance()); + } else if (!Strings.isNullOrEmpty(getEndpoint())) { + scope = DefaultScopeDefine.ENDPOINT; + id = IDManager.EndpointID.buildId(serviceId, getEndpoint()); + } + return new MetricsMetaInfo(getName(), scope, id); + } + public static class Builder implements StorageHashMapBuilder { @Override public Map entity2Storage(Event storageData) { -- GitLab