Support k8s monitoring (#6479)

f9096f50 · wankai123 · GitHub · 96611394 · f9096f50 · f9096f50
9 changed files
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -68,6 +68,8 @@ Release Notes.
 * Optimize the self monitoring grafana dashboard.
 * Enhance the export service.
 * Add function `retagByK8sMeta` and opt type `K8sRetagType.Pod2Service` in MAL for k8s to relate pods and services.
+* Using "service.istio.io/canonical-name" to replace "app" label to resolve Envoy ALS service name.
+* Support k8s monitoring.
 * Make the flushing metrics operation concurrent.
 * Fix ALS K8SServiceRegistry didn't remove the correct entry.
 * Using "service.istio.io/canonical-name" to replace "app" label to resolve Envoy ALS service name.

--- a/docs/en/setup/backend/backend-receivers.md
+++ b/docs/en/setup/backend/backend-receivers.md
@@ -132,6 +132,9 @@ to be the identification of the metric data.
 |istio-controlplane| Metrics of Istio control panel | otel-oc-rules/istio-controlplane.yaml | Istio Control Panel -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
 |oap| Metrics of SkyWalking OAP server itself | otel-oc-rules/oap.yaml | SkyWalking OAP Server(SelfObservability) -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
 |vm| Metrics of VMs | otel-oc-rules/vm.yaml | Prometheus node-exporter(VMs) -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
+|k8s-cluster| Metrics of K8s cluster | otel-oc-rules/k8s-cluster.yaml | K8s kube-state-metrics -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
+|k8s-node| Metrics of K8s nodes | otel-oc-rules/k8s-node.yaml | cAdvisor & K8s kube-state-metrics -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |
+|k8s-service| Metrics of K8s services | otel-oc-rules/k8s-service.yaml | cAdvisor & K8s kube-state-metrics -> OpenTelemetry Collector --OC format--> SkyWalking OAP Server |

 ## Meter receiver


--- a/oap-server/analyzer/meter-analyzer/src/main/java/org/apache/skywalking/oap/meter/analyzer/dsl/tagOpt/K8sRetagType.java
+++ b/oap-server/analyzer/meter-analyzer/src/main/java/org/apache/skywalking/oap/meter/analyzer/dsl/tagOpt/K8sRetagType.java
@@ -27,7 +27,6 @@ import org.apache.skywalking.oap.meter.analyzer.dsl.Sample;
 import org.apache.skywalking.oap.meter.analyzer.k8s.K8sInfoRegistry;

 public enum K8sRetagType implements Retag {
-
    Pod2Service {
        @Override
        public Sample[] execute(final Sample[] ss,
@@ -39,11 +38,12 @@ public enum K8sRetagType implements Retag {
                String namespace = sample.getLabels().get(namespaceLabelName);
                if (!Strings.isNullOrEmpty(podName) && !Strings.isNullOrEmpty(namespace)) {
                    String serviceName = K8sInfoRegistry.getInstance().findServiceName(namespace, podName);
-                    if (!Strings.isNullOrEmpty(serviceName)) {
-                        Map<String, String> labels = Maps.newHashMap(sample.getLabels());
-                        labels.put(newLabelName, serviceName);
-                        return sample.toBuilder().labels(ImmutableMap.copyOf(labels)).build();
+                    if (Strings.isNullOrEmpty(serviceName)) {
+                        serviceName = BLANK;
                    }
+                    Map<String, String> labels = Maps.newHashMap(sample.getLabels());
+                    labels.put(newLabelName, serviceName);
+                    return sample.toBuilder().labels(ImmutableMap.copyOf(labels)).build();
                }
                return sample;
            }).toArray(Sample[]::new);

--- a/oap-server/analyzer/meter-analyzer/src/main/java/org/apache/skywalking/oap/meter/analyzer/dsl/tagOpt/Retag.java
+++ b/oap-server/analyzer/meter-analyzer/src/main/java/org/apache/skywalking/oap/meter/analyzer/dsl/tagOpt/Retag.java
@@ -21,5 +21,6 @@ package org.apache.skywalking.oap.meter.analyzer.dsl.tagOpt;
 import org.apache.skywalking.oap.meter.analyzer.dsl.Sample;

 public interface Retag {
+    String BLANK = ""; // empty label value; K8sRetagType.Pod2Service writes it when no service name is found for a pod
     Sample[] execute(Sample[] ss, String newLabelName, String existingLabelName, String namespaceLabelName);
 }
--- a/oap-server/analyzer/meter-analyzer/src/test/java/org/apache/skywalking/oap/meter/analyzer/dsl/K8sTagTest.java
+++ b/oap-server/analyzer/meter-analyzer/src/test/java/org/apache/skywalking/oap/meter/analyzer/dsl/K8sTagTest.java
@@ -28,6 +28,7 @@ import java.util.Collection;
 import java.util.Map;
 import lombok.SneakyThrows;
 import lombok.extern.slf4j.Slf4j;
+import org.apache.skywalking.oap.meter.analyzer.dsl.tagOpt.Retag;
 import org.apache.skywalking.oap.meter.analyzer.k8s.K8sInfoRegistry;
 import org.junit.Before;
 import org.junit.Test;
@@ -133,7 +134,7 @@ public class K8sTagTest {
                          .labels(
                              of(
                                  "namespace", "default", "container", "my-nginx", "cpu", "total", "pod",
-                                  "my-nginx-5dc4865748-no-pod"
+                                  "my-nginx-5dc4865748-no-pod" , "service", Retag.BLANK
                              ))
                          .value(2)
                          .build(),
@@ -175,7 +176,7 @@ public class K8sTagTest {
                          .labels(
                              of(
                                  "namespace", "default", "container", "my-nginx", "cpu", "total", "pod",
-                                  "my-nginx-5dc4865748-no-service"
+                                  "my-nginx-5dc4865748-no-service" , "service", Retag.BLANK
                              ))
                          .value(2)
                          .build(),

--- a/oap-server/server-bootstrap/src/main/resources/otel-oc-rules/k8s-cluster.yaml
+++ b/oap-server/server-bootstrap/src/main/resources/otel-oc-rules/k8s-cluster.yaml
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This will parse a textual representation of a duration. The formats
+# accepted are based on the ISO-8601 duration format {@code PnDTnHnMn.nS}
+# with days considered to be exactly 24 hours.
+# <p>
+# Examples:
+# <pre>
+#    "PT20.345S" -- parses as "20.345 seconds"
+#    "PT15M"     -- parses as "15 minutes" (where a minute is 60 seconds)
+#    "PT10H"     -- parses as "10 hours" (where an hour is 3600 seconds)
+#    "P2D"       -- parses as "2 days" (where a day is 24 hours or 86400 seconds)
+#    "P2DT3H4M"  -- parses as "2 days, 3 hours and 4 minutes"
+#    "P-6H3M"    -- parses as "-6 hours and +3 minutes"
+#    "-P6H3M"    -- parses as "-6 hours and -3 minutes"
+#    "-P-6H+3M"  -- parses as "+6 hours and -3 minutes"
+# </pre>
+# Appended to every rule expression below: rewrites the `cluster` tag to
+# 'k8s-cluster::' + its original value, then applies service(['cluster']).
+expSuffix: tag({tags -> tags.cluster = 'k8s-cluster::' + tags.cluster}).service(['cluster'])
+# Prefix for the rule names below.
+metricPrefix: k8s_cluster
+metricsRules:
+
+
+  # CPU: capacity / allocatable / requests / limits. Source values are multiplied by 1000,
+  # restricted via tagEqual to resource 'cpu', and summed per cluster.
+  - name: cpu_cores
+    exp: (kube_node_status_capacity * 1000).tagEqual('resource' , 'cpu').sum(['cluster'])
+  - name: cpu_cores_allocatable
+    exp: (kube_node_status_allocatable * 1000).tagEqual('resource' , 'cpu').sum(['cluster'])
+  - name: cpu_cores_requests
+    exp: (kube_pod_container_resource_requests * 1000).tagEqual('resource' , 'cpu').sum(['cluster'])
+  - name: cpu_cores_limits
+    exp: (kube_pod_container_resource_limits * 1000).tagEqual('resource' , 'cpu').sum(['cluster'])
+
+  # Memory: same four sources as CPU, restricted to resource 'memory', values not scaled.
+  - name: memory_total
+    exp: kube_node_status_capacity.tagEqual('resource' , 'memory').sum(['cluster'])
+  - name: memory_allocatable
+    exp: kube_node_status_allocatable.tagEqual('resource' , 'memory').sum(['cluster'])
+  - name: memory_requests
+    exp: kube_pod_container_resource_requests.tagEqual('resource' , 'memory').sum(['cluster'])
+  - name: memory_limits
+    exp: kube_pod_container_resource_limits.tagEqual('resource' , 'memory').sum(['cluster'])
+
+  # Storage: node capacity / allocatable restricted to resource 'ephemeral_storage'.
+  - name: storage_total
+    exp: kube_node_status_capacity.tagEqual('resource' , 'ephemeral_storage').sum(['cluster'])
+  - name: storage_allocatable
+    exp: kube_node_status_allocatable.tagEqual('resource' , 'ephemeral_storage').sum(['cluster'])
+
+  # Nodes: total per cluster, and condition samples whose value is 1 and whose
+  # `status` tag matches 'true|unknown', grouped by cluster, node and condition.
+  - name: node_total
+    exp: kube_node_info.sum(['cluster'])
+  - name: node_status
+    exp: kube_node_status_condition.valueEqual(1).tagMatch('status' , 'true|unknown').sum(['cluster' , 'node' ,'condition'])
+
+  # Namespaces.
+  - name: namespace_total
+    exp: kube_namespace_labels.sum(['cluster'])
+
+  # Deployments. deployment_status uses only the 'Available' condition and removes
+  # the `condition` tag after aggregation, leaving cluster / deployment / status.
+  - name: deployment_total
+    exp: kube_deployment_labels.sum(['cluster'])
+  - name: deployment_status
+    exp: kube_deployment_status_condition.valueEqual(1).tagMatch('condition' , 'Available').sum(['cluster' , 'deployment' ,'condition' , 'status']).tag({tags -> tags.remove('condition')})
+  - name: deployment_spec_replicas
+    exp: kube_deployment_spec_replicas.sum(['cluster' , 'deployment'])
+
+  # Services. service_pod_status adds a `service` tag from the `pod` and `namespace` tags
+  # (Pod2Service); tagNotEqual('service' , '') then presumably drops pods with no resolved service.
+  - name: service_total
+    exp: kube_service_info.sum(['cluster'])
+  - name: service_pod_status
+    exp: kube_pod_status_phase.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').valueEqual(1).sum(['cluster' , 'service' , 'phase'])
+
+  # Pods: total, and pods whose `phase` tag does not match 'Running'.
+  - name: pod_total
+    exp: kube_pod_info.sum(['cluster'])
+  - name: pod_status_not_running
+    exp: kube_pod_status_phase.valueEqual(1).tagNotMatch('phase' , 'Running').sum(['cluster' , 'pod' , 'phase'])
+
+  # Containers: total, plus waiting / terminated reasons grouped by pod, container and reason.
+  - name: container_total
+    exp: kube_pod_container_info.sum(['cluster'])
+  - name: pod_status_waiting
+    exp: kube_pod_container_status_waiting_reason.valueEqual(1).sum(['cluster' , 'pod' , 'container' , 'reason'])
+  - name: pod_status_terminated
+    exp: kube_pod_container_status_terminated_reason.valueEqual(1).sum(['cluster' , 'pod' , 'container' , 'reason'])
--- a/oap-server/server-bootstrap/src/main/resources/otel-oc-rules/k8s-node.yaml
+++ b/oap-server/server-bootstrap/src/main/resources/otel-oc-rules/k8s-node.yaml
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Duration arguments used in the expressions below (e.g. rate('PT1M')) are given as
+# a textual representation of a duration. The formats
+# accepted are based on the ISO-8601 duration format {@code PnDTnHnMn.nS}
+# with days considered to be exactly 24 hours.
+# <p>
+# Examples:
+# <pre>
+#    "PT20.345S" -- parses as "20.345 seconds"
+#    "PT15M"     -- parses as "15 minutes" (where a minute is 60 seconds)
+#    "PT10H"     -- parses as "10 hours" (where an hour is 3600 seconds)
+#    "P2D"       -- parses as "2 days" (where a day is 24 hours or 86400 seconds)
+#    "P2DT3H4M"  -- parses as "2 days, 3 hours and 4 minutes"
+#    "P-6H3M"    -- parses as "-6 hours and +3 minutes"
+#    "-P6H3M"    -- parses as "-6 hours and -3 minutes"
+#    "-P-6H+3M"  -- parses as "+6 hours and -3 minutes"
+# </pre>
+
+# Appended to every rule expression below: rewrites the `cluster` tag to
+# 'k8s-cluster::' + its original value, then applies instance(['cluster'] , ['node']).
+expSuffix: tag({tags -> tags.cluster = 'k8s-cluster::' + tags.cluster}).instance(['cluster'] , ['node'])
+# Prefix for the rule names below.
+metricPrefix: k8s_node
+metricsRules:
+
+  # CPU: source values are multiplied by 1000 and summed per cluster / node.
+  # cpu_usage takes samples tagged id '/' and applies rate('PT1M') (a 1-minute ISO-8601 duration).
+  - name: cpu_cores
+    exp: (kube_node_status_capacity * 1000).tagEqual('resource' , 'cpu').sum(['cluster' , 'node'])
+  - name: cpu_usage
+    exp: (container_cpu_usage_seconds_total * 1000).tagEqual('id' , '/').sum(['cluster' , 'node']).rate('PT1M')
+  - name: cpu_cores_allocatable
+    exp: (kube_node_status_allocatable * 1000).tagEqual('resource' , 'cpu').sum(['cluster' , 'node'])
+  - name: cpu_cores_requests
+    exp: (kube_pod_container_resource_requests * 1000).tagEqual('resource' , 'cpu').sum(['cluster' , 'node'])
+  - name: cpu_cores_limits
+    exp: (kube_pod_container_resource_limits * 1000).tagEqual('resource' , 'cpu').sum(['cluster' , 'node'])
+
+  # Memory: capacity / allocatable / requests / limits restricted to resource 'memory'.
+  - name: memory_total
+    exp: kube_node_status_capacity.tagEqual('resource' , 'memory').sum(['cluster' , 'node'])
+  - name: memory_allocatable
+    exp: kube_node_status_allocatable.tagEqual('resource' , 'memory').sum(['cluster' , 'node'])
+  - name: memory_requests
+    exp: kube_pod_container_resource_requests.tagEqual('resource' , 'memory').sum(['cluster' , 'node'])
+  - name: memory_limits
+    exp: kube_pod_container_resource_limits.tagEqual('resource' , 'memory').sum(['cluster' , 'node'])
+
+  # Memory usage: working-set bytes of samples tagged id '/'.
+  - name: memory_usage
+    exp: container_memory_working_set_bytes.tagEqual('id' , '/').sum(['cluster' , 'node'])
+
+
+  # Storage: node capacity / allocatable restricted to resource 'ephemeral_storage'.
+  - name: storage_total
+    exp: kube_node_status_capacity.tagEqual('resource' , 'ephemeral_storage').sum(['cluster' , 'node'])
+  - name: storage_allocatable
+    exp: kube_node_status_allocatable.tagEqual('resource' , 'ephemeral_storage').sum(['cluster' , 'node'])
+
+  # Node conditions whose value is 1 and whose `status` tag matches 'true|unknown'.
+  - name: node_status
+    exp: kube_node_status_condition.valueEqual(1).tagMatch('status' , 'true|unknown').sum(['cluster' , 'node' ,'condition'])
+
+  # Pods per node.
+  - name: pod_total
+    exp: kube_pod_info.sum(['cluster' , 'node'])
+
+  # Network: received / transmitted byte counters summed per node, then irate().
+  - name: network_receive
+    exp: container_network_receive_bytes_total.sum(['cluster' , 'node']).irate()
+  - name: network_transmit
+    exp: container_network_transmit_bytes_total.sum(['cluster' , 'node']).irate()
--- a/oap-server/server-bootstrap/src/main/resources/otel-oc-rules/k8s-service.yaml
+++ b/oap-server/server-bootstrap/src/main/resources/otel-oc-rules/k8s-service.yaml
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Duration arguments used in the expressions below (e.g. rate('PT1M')) are given as
+# a textual representation of a duration. The formats
+# accepted are based on the ISO-8601 duration format {@code PnDTnHnMn.nS}
+# with days considered to be exactly 24 hours.
+# <p>
+# Examples:
+# <pre>
+#    "PT20.345S" -- parses as "20.345 seconds"
+#    "PT15M"     -- parses as "15 minutes" (where a minute is 60 seconds)
+#    "PT10H"     -- parses as "10 hours" (where an hour is 3600 seconds)
+#    "P2D"       -- parses as "2 days" (where a day is 24 hours or 86400 seconds)
+#    "P2DT3H4M"  -- parses as "2 days, 3 hours and 4 minutes"
+#    "P-6H3M"    -- parses as "-6 hours and +3 minutes"
+#    "-P6H3M"    -- parses as "-6 hours and -3 minutes"
+#    "-P-6H+3M"  -- parses as "+6 hours and -3 minutes"
+# </pre>
+# Appended to every rule expression below: rewrites the `cluster` tag to
+# 'k8s-cluster::' + its original value, then applies endpoint(['cluster'] , ['service']).
+expSuffix: tag({tags -> tags.cluster = 'k8s-cluster::' + tags.cluster}).endpoint(['cluster'] , ['service'])
+# Prefix for the rule names below.
+metricPrefix: k8s_service
+metricsRules:
+
+  # Every rule first calls retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace'),
+  # which adds a `service` tag looked up from the `pod` and `namespace` tags. Pods with no
+  # resolved service get an empty `service` value, so tagNotEqual('service' , '') presumably
+  # excludes them from the per-service aggregation.
+  - name: pod_total
+    exp: kube_pod_info.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service'])
+
+  # Resource requests / limits per service; CPU values are multiplied by 1000, memory values are not.
+  - name: cpu_cores_requests
+    exp: (kube_pod_container_resource_requests * 1000).retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').tagEqual('resource' , 'cpu').sum(['cluster' , 'service'])
+  - name: cpu_cores_limits
+    exp: (kube_pod_container_resource_limits * 1000).retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').tagEqual('resource' , 'cpu').sum(['cluster' , 'service'])
+  - name: memory_requests
+    exp: kube_pod_container_resource_requests.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').tagEqual('resource' , 'memory').sum(['cluster' , 'service'])
+  - name: memory_limits
+    exp: kube_pod_container_resource_limits.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').tagEqual('resource' , 'memory').sum(['cluster' , 'service'])
+
+  # Pod / container status per service: phase, waiting reason, terminated reason (samples with
+  # value 1 only), and container restart totals.
+  - name: pod_status
+    exp: kube_pod_status_phase.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').valueEqual(1).sum(['cluster' , 'service' , 'pod' , 'phase'])
+  - name: pod_status_waiting
+    exp: kube_pod_container_status_waiting_reason.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').valueEqual(1).sum(['cluster' , 'service' , 'pod' ,  'container' , 'reason'])
+  - name: pod_status_terminated
+    exp: kube_pod_container_status_terminated_reason.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').valueEqual(1).sum(['cluster' , 'service' , 'pod' ,  'container' , 'reason'])
+  - name: pod_status_restarts_total
+    exp: kube_pod_container_status_restarts_total.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod'])
+
+  # Per-pod usage. pod_cpu_usage skips samples with an empty `pod` tag before retagging
+  # and applies rate('PT1M'); pod_memory_usage has no such `pod` filter.
+  - name: pod_cpu_usage
+    exp: (container_cpu_usage_seconds_total * 1000).tagNotEqual('pod' , '').retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod']).rate('PT1M')
+  - name: pod_memory_usage
+    exp: container_memory_working_set_bytes.retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod'])
+
+  # Per-pod network (irate of byte counters) and filesystem usage; samples with an empty `pod` tag are skipped.
+  - name: pod_network_receive
+    exp: container_network_receive_bytes_total.tagNotEqual('pod' , '').retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod']).irate()
+  - name: pod_network_transmit
+    exp: container_network_transmit_bytes_total.tagNotEqual('pod' , '').retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod']).irate()
+  - name: pod_fs_usage
+    exp: container_fs_usage_bytes.tagNotEqual('pod' , '').retagByK8sMeta('service' , K8sRetagType.Pod2Service , 'pod' , 'namespace').tagNotEqual('service' , '').sum(['cluster' , 'service' , 'pod'])
--- a/oap-server/server-bootstrap/src/main/resources/ui-initialized-templates/k8s.yml
+++ b/oap-server/server-bootstrap/src/main/resources/ui-initialized-templates/k8s.yml